@comment{Conference paper from the 2010 4th International Universal Communication Symposium (IUCS 2010) on using the LIVAC synchronous corpus as a monitoring corpus. Proper nouns in the title are brace-protected so sentence-casing styles keep their capitals.}
@inproceedings{35bbe16f7f764a588f26336ebd686296,
  author    = {Tsou, Benjamin K. and Chin, Andy C.},
  title     = {A large synchronous corpus as monitoring corpus: Some comparative content analysis of {Chinese} and {Japanese} language developments},
  booktitle = {2010 4th International Universal Communication Symposium, IUCS 2010 - Proceedings},
  pages     = {90--96},
  year      = {2010},
  doi       = {10.1109/IUCS.2010.5666763},
  isbn      = {9781424478200},
  language  = {English},
  keywords  = {Chinese, Homothematic corpus, Japanese, Linguistic and social variation, Monitoring corpus, Synchronous corpus},
  abstract  = {Appropriate and large corpora are uncommon but they can provide important resources for wide ranging efforts in natural language processing, ranging from contextualized or localized speech and text input to automatic patent translation. They also provide lesser known rich resources for human and automatic content analysis such as sentiment analysis of texts and product reviews. Furthermore they can function as a monitoring corpus and enhance the human centered communication environment by allowing more substantive introspection and comparison of content rather than the linguistic form in communication. This paper discusses the methodological background of a very large and unique synchronous corpus of Chinese, LIVAC, which regularly and synchronously samples news media texts from 6 major Chinese cities and occasionally from Japan. For 16 continuous years, it has processed and analyzed more than 400 million characters of Chinese news media texts and culled more than 1.5 million basic lexical entries and useful information such as on their associated basic linguistic and usage characteristics. We make an attempt to capitalize on its synchronous nature and homothematic content and to use an innovative Windows approach to explore its use as a Monitoring Corpus by tracking and doing innovative and meaningful content analysis of salient cultural items. They include content rich words such as BAR and VEHICLE and their differential derivative development and usage within windows of different sizes and up to 10 years apart. It will be shown that based on the comparative analysis of the contents in the windows, salient information can be obtained on possible changes in the relative cultural orientations and mutual influences among the Chinese communities, and between Chinese and Japanese societies, and how innovative analysis has been made possible by using the LIVAC synchronous corpus as a monitoring corpus.},
  note      = {2010 4th International Universal Communication Symposium, IUCS 2010 ; Conference date: 18-10-2010 Through 19-10-2010},
}