Make stopword list for hc report terms configurable
shawnmjones opened this issue · comments
The stopwords for hc report terms
are currently hardcoded. Even worse, they are hardcoded only in the sumgram code and not in the general n-gram code.
hypercane/hypercane/report/sumgrams.py
Lines 53 to 162 in 2e071d7
# TODO: load these from a file
added_stopwords = [
    "associated press",
    "com",
    "donald trump",
    "fox news",
    "abc news",
    "getty images",
    "last month",
    "last week",
    "last year",
    "pic",
    "pinterest reddit",
    "pm et",
    "president donald",
    "president donald trump",
    "president trump",
    "president trump's",
    "print mail",
    "reddit print",
    "said statement",
    "send whatsapp",
    "sign up",
    "trump administration",
    "trump said",
    "twitter",
    "united states",
    "washington post",
    "white house",
    "whatsapp pinterest",
    "subscribe whatsapp",
    "york times",
    "privacy policy",
    "terms use",
]

# stop the "<year> read" n-grams for this year and last
added_stopwords += [
    "{} read".format(last_year),
    "{} read".format(current_year),
]

# add just the month to the stop words
stopmonths = [
    "january", "february", "march", "april",
    "may", "june", "july", "august",
    "september", "october", "november", "december",
]
added_stopwords += stopmonths

stopmonths_short = [
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
]
added_stopwords += stopmonths_short

# add the day of the week, too
added_stopwords += [
    "monday", "tuesday", "wednesday", "thursday",
    "friday", "saturday", "sunday",
]
added_stopwords += [
    "mon", "tue", "wed", "thu", "fri", "sat", "sun",
]

# disabled: month-and-year combinations (e.g. "Jan 2020" / "January 2020")
# for i in range(1, 13):
#     added_stopwords.append(
#         datetime(current_year, i, current_date).strftime('%b %Y')
#     )
#     added_stopwords.append(
#         datetime(last_year, i, current_date).strftime('%b %Y')
#     )
# for i in range(1, 13):
#     added_stopwords.append(
#         datetime(current_year, i, current_date).strftime('%B %Y')
#     )
#     added_stopwords.append(
#         datetime(last_year, i, current_date).strftime('%B %Y')
#     )
The generic terms report will need to accept the same stopword list at `get_document_tokens`:
hypercane/hypercane/report/terms.py
Lines 6 to 28 in 2e071d7
def get_document_tokens(urim, cache_storage, ngram_length, added_stopwords=None):
    """Tokenize the boilerplate-free content of a memento and return its n-grams.

    Parameters:
    urim -- URI-M of the memento whose content should be tokenized
    cache_storage -- cache connection value passed through to
        get_boilerplate_free_content
    ngram_length -- size of the n-grams produced
    added_stopwords -- optional iterable of extra stopwords to filter out
        in addition to the built-in English stoplist; default None keeps
        the previous behavior, so existing callers are unaffected

    Returns a list of ngram_length-tuples of tokens.
    """
    from hypercane.utils import get_boilerplate_free_content
    from nltk.corpus import stopwords
    from nltk import word_tokenize, ngrams
    import string

    # TODO: stoplist based on language of the document
    stoplist = list(set(stopwords.words('english')))
    punctuation = [ i for i in string.punctuation ]
    additional_stopchars = [ '’', '‘', '“', '”', '•', '·', '—', '–', '›', '»']
    stop_numbers = [ str(i) for i in range(0, 11) ]  # "0" through "10"
    allstop = stoplist + punctuation + additional_stopchars + stop_numbers

    # fold in caller-supplied stopwords; lowercased because the document
    # tokens are lowercased before the membership test below
    if added_stopwords is not None:
        allstop = allstop + [ w.lower() for w in added_stopwords ]

    # a set makes each membership test O(1) instead of O(len(allstop))
    allstop = set(allstop)

    content = get_boilerplate_free_content(urim, cache_storage=cache_storage)
    doc_tokens = word_tokenize(content.decode('utf8').lower())
    doc_tokens = [ token for token in doc_tokens if token not in allstop ]
    # strip punctuation embedded inside surviving tokens, then drop empties
    table = str.maketrans('', '', string.punctuation)
    doc_tokens = [ w.translate(table) for w in doc_tokens ]
    doc_tokens = [ w for w in doc_tokens if len(w) > 0 ]
    doc_ngrams = ngrams(doc_tokens, ngram_length)
    return list(doc_ngrams)
See "Automatically Building a Stopword List for an Information Retrieval System" for an idea on how we might automatically compute stopwords. I suspect that we need to include stopwords elsewhere to improve the results of DSA1. With this realization, we might want to give this a little more thought before just testing and releasing the recent code changes.