Make stopword list for hc report terms configurable
shawnmjones opened this issue · comments
The stopwords for hc report terms
are currently hardcoded. Even worse, they are hardcoded only in the sumgram code and not in the general n-gram code.
hypercane/hypercane/report/sumgrams.py
Lines 53 to 162 in 2e071d7
# TODO: load these from a file
added_stopwords = [
    "associated press",
    "com",
    "donald trump",
    "fox news",
    "abc news",
    "getty images",
    "last month",
    "last week",
    "last year",
    "pic",
    "pinterest reddit",
    "pm et",
    "president donald",
    "president donald trump",
    "president trump",
    "president trump's",
    "print mail",
    "reddit print",
    "said statement",
    "send whatsapp",
    "sign up",
    "trump administration",
    "trump said",
    "twitter",
    "united states",
    "washington post",
    "white house",
    "whatsapp pinterest",
    "subscribe whatsapp",
    "york times",
    "privacy policy",
    "terms use",
]

# stop the "<year> read" n-grams for this year and last
added_stopwords += [
    "{} read".format(last_year),
    "{} read".format(current_year),
]

# add just the month to the stop words
stopmonths = [
    "january", "february", "march", "april",
    "may", "june", "july", "august",
    "september", "october", "november", "december",
]
added_stopwords += stopmonths

stopmonths_short = [
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
]
added_stopwords += stopmonths_short

# add the day of the week, too
added_stopwords += [
    "monday", "tuesday", "wednesday", "thursday",
    "friday", "saturday", "sunday",
]
added_stopwords += [
    "mon", "tue", "wed", "thu", "fri", "sat", "sun",
]

# disabled: month-and-year combinations (e.g. "Jan 2020" / "January 2020")
# for i in range(1, 13):
#     added_stopwords.append(
#         datetime(current_year, i, current_date).strftime('%b %Y')
#     )
#     added_stopwords.append(
#         datetime(last_year, i, current_date).strftime('%b %Y')
#     )
# for i in range(1, 13):
#     added_stopwords.append(
#         datetime(current_year, i, current_date).strftime('%B %Y')
#     )
#     added_stopwords.append(
#         datetime(last_year, i, current_date).strftime('%B %Y')
#     )
The generic terms report will need to accept the same stopword list at `get_document_tokens`:
hypercane/hypercane/report/terms.py
Lines 6 to 28 in 2e071d7
def get_document_tokens(urim, cache_storage, ngram_length, added_stopwords=None):
    """Tokenize the boilerplate-free content of a memento and return its n-grams.

    Parameters:
    urim -- URI-M of the memento whose content should be tokenized
    cache_storage -- cache connection value passed through to
        get_boilerplate_free_content
    ngram_length -- size of the n-grams produced
    added_stopwords -- optional iterable of extra stopwords to filter out
        in addition to the built-in English stoplist; default None keeps
        the previous behavior, so existing callers are unaffected

    Returns a list of ngram_length-tuples of tokens.
    """
    from hypercane.utils import get_boilerplate_free_content
    from nltk.corpus import stopwords
    from nltk import word_tokenize, ngrams
    import string

    # TODO: stoplist based on language of the document
    stoplist = list(set(stopwords.words('english')))
    punctuation = [ i for i in string.punctuation ]
    additional_stopchars = [ '’', '‘', '“', '”', '•', '·', '—', '–', '›', '»']
    stop_numbers = [ str(i) for i in range(0, 11) ]  # "0" through "10"
    allstop = stoplist + punctuation + additional_stopchars + stop_numbers

    # fold in caller-supplied stopwords; lowercased because the document
    # tokens are lowercased before the membership test below
    if added_stopwords is not None:
        allstop = allstop + [ w.lower() for w in added_stopwords ]

    # a set makes each membership test O(1) instead of O(len(allstop))
    allstop = set(allstop)

    content = get_boilerplate_free_content(urim, cache_storage=cache_storage)
    doc_tokens = word_tokenize(content.decode('utf8').lower())
    doc_tokens = [ token for token in doc_tokens if token not in allstop ]
    # strip punctuation embedded inside surviving tokens, then drop empties
    table = str.maketrans('', '', string.punctuation)
    doc_tokens = [ w.translate(table) for w in doc_tokens ]
    doc_tokens = [ w for w in doc_tokens if len(w) > 0 ]
    doc_ngrams = ngrams(doc_tokens, ngram_length)
    return list(doc_ngrams)
See "Automatically Building a Stopword List for an Information Retrieval System" for an idea on how we might automatically compute stopwords. I suspect that we need to include stopwords elsewhere to improve the results of DSA1. With this realization, we might want to give this a little more thought before just testing and releasing the recent code changes.