Main Cleaning and Tokenization Functions
The main functions used to clean and tokenize the data are `clean_vector_source`, `get_all_grams`, and `get_gram_counts`. The goal is to have tokens that contain only ASCII characters, may be hyphenated or include an apostrophe, are not digits, and may be in any letter case.
# First-Order Helper Functions ----
clean_vector_source = function(vector_source) {
  ## Given a character vector source,
  ## 1) Replaces all digits with their word equivalent (via str_num2words,
  ##    a helper defined elsewhere in the project)
  ## 2) Removes all punctuation other than hyphens and apostrophes
  ## 3) Keeps hyphens only if they are connecting a compound word,
  ##    as in "Part-time student"
  ## 4) Removes input lines with non-ASCII characters
  ## Returns a clean character vector
  clean_output = vector_source %>%
    str_num2words %>%
    str_replace_all('[^\\sA-Za-z\'\\-]', '') %>%
    # Remove any hyphen NOT flanked by letters on both sides.
    # The earlier pattern combined the two negative lookarounds with AND,
    # so it only stripped hyphens isolated on BOTH sides; leading ("-foo")
    # and trailing ("foo-") hyphens slipped through. OR-ing the two
    # conditions matches the documented intent.
    str_replace_all('(?<![A-Za-z])-|-(?![A-Za-z])', '')
  # Drop whole input lines that still contain anything outside
  # printable ASCII (0x20-0x7E).
  clean_output = clean_output[!str_detect(clean_output, "[^\x20-\x7E]")]
  return(clean_output)
}
get_all_grams <- function(tbl_source) {
  ## Given an input tbl with a character vector column called "clean",
  ## extracts all unigrams, bigrams, and trigrams from that column.
  ## Returns a single data frame containing all tokens, labelled in the
  ## n_gram column ("one"/"two"/"three") by the number of grams per token.
  gram_sizes <- c(one = 1, two = 2, three = 3)
  # Tokenize once per n-gram size, tagging each result with its label.
  token_frames <- lapply(names(gram_sizes), function(label) {
    grams <- tbl_source %>%
      unnest_tokens(token, clean, token = "ngrams",
                    n = gram_sizes[[label]], to_lower = FALSE)
    grams$n_gram <- label
    grams
  })
  # Rows come out in the same order as before: unigrams, bigrams, trigrams.
  bind_rows(token_frames)
}
# Second-Order Helper Functions ----
get_gram_counts = function(vector_source) {
  ## Given a character vector source,
  ## runs clean_vector_source and get_all_grams,
  ## then counts the number of instances of each unique token.
  ## Returns a tbl data frame of all the tokens from the input vector,
  ## sorted by number of occurrences and labelled (n_gram column) by the
  ## number of grams in each token. The result is grouped by n_gram.
  all_tokens = data.frame(clean = clean_vector_source(vector_source),
                          stringsAsFactors = FALSE) %>%
    # tbl_df() is deprecated in dplyr; as_tibble() (re-exported by dplyr)
    # is its direct replacement, so no new package is required.
    as_tibble %>%
    get_all_grams %>%
    group_by(n_gram) %>%
    count(token, sort = TRUE)
  return(all_tokens)
}