Using Positional Attributes.
CWB-indexed corpora store the text of a corpus as numbers: every token in the token stream of the corpus is identified by a unique corpus position, and the string value of every token is identified by a unique integer id. The corpus library (CL) offers a set of functions to convert between corpus positions, token ids, and the string values of tokens.
cl_cpos2str(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), cpos) cl_cpos2id(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), cpos) cl_id2str(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), id) cl_regex2id(corpus, p_attribute, regex, registry = Sys.getenv("CORPUS_REGISTRY")) cl_str2id(corpus, p_attribute, str, registry = Sys.getenv("CORPUS_REGISTRY")) cl_id2freq(corpus, p_attribute, id, registry = Sys.getenv("CORPUS_REGISTRY")) cl_id2cpos(corpus, p_attribute, id, registry = Sys.getenv("CORPUS_REGISTRY"))
corpus |
name of a CWB corpus (upper case) |
p_attribute |
a p-attribute (positional attribute) |
registry |
path to the registry directory, defaults to the value of the environment variable CORPUS_REGISTRY |
cpos |
corpus positions (integer vector) |
id |
id(s) of tokens (integer vector) |
regex |
a regular expression |
str |
a character string |
# Examples: moving between corpus positions (cpos), token ids and token
# strings of the "word" p-attribute of the REUTERS sample corpus.

# The registry directory and cpos_total are needed in the examples below.
registry <- if (!check_pkg_registry_files()) {
  use_tmp_registry()
} else {
  get_pkg_registry()
}
# Calls further down that omit `registry` rely on this environment variable,
# which is the documented default of the `registry` argument.
Sys.setenv(CORPUS_REGISTRY = registry)

cpos_total <- cl_attribute_size(
  corpus = "REUTERS",
  attribute = "word",
  attribute_type = "p",
  registry = registry
)

# Corpus positions are zero-based: 0 .. (cpos_total - 1).
# seq_len() stays empty for an empty attribute, whereas seq.int(0, -1)
# would count downwards and yield c(0, -1).
all_cpos <- seq_len(cpos_total) - 1L

# Decode the token stream of the corpus (the quick way).
token_stream_str <- cl_cpos2str(
  corpus = "REUTERS",
  p_attribute = "word",
  cpos = all_cpos,
  registry = registry
)

# Decode the token stream in two steps (cpos2id first, then id2str).
token_stream_ids <- cl_cpos2id(
  corpus = "REUTERS",
  p_attribute = "word",
  cpos = all_cpos,
  registry = registry
)
token_stream_str <- cl_id2str(
  corpus = "REUTERS",
  p_attribute = "word",
  id = token_stream_ids,
  registry = registry
)

# Get the corpus positions of a token.
token_to_get <- "oil"
id_oil <- cl_str2id(
  corpus = "REUTERS",
  p_attribute = "word",
  str = token_to_get
)
# Assign the result to cpos_oil only; a chained assignment to `cl_id2cpos`
# would mask the function of the same name in the workspace.
cpos_oil <- cl_id2cpos(
  corpus = "REUTERS",
  p_attribute = "word",
  id = id_oil
)

# Get the frequency of the token.
oil_freq <- cl_id2freq(
  corpus = "REUTERS",
  p_attribute = "word",
  id = id_oil
)
length(cpos_oil) # needs to be the same as oil_freq

# Use a regular expression to look up matching token ids, then decode them.
ids <- cl_regex2id(
  corpus = "REUTERS",
  p_attribute = "word",
  regex = "M.*"
)
m_words <- cl_id2str(
  corpus = "REUTERS",
  p_attribute = "word",
  id = ids
)
Please choose more modern alternatives, such as Google Chrome or Mozilla Firefox.