IsADirectoryError: [Errno 21] Is a directory: '/data/bdlml/mkabra/nltk_data/corpora/stopwords'
minniekabra opened this issue · comments
I am getting this error while using keybert, and I have no idea how to resolve it.
IsADirectoryError: [Errno 21] Is a directory: '/data/bdlml/mkabra/nltk_data/corpora/stopwords'
I deleted the directory, and then it says OSError: No such file or directory: '/data/bdlml/mkabra/nltk_data/corpora/stopwords'
Can you please help with it?
Extracting KeyPhrases: 0%| | 0/1 [00:00<?, ?it/s]
---------------------------------------------------------------------------
IsADirectoryError Traceback (most recent call last)
/tmp/ipykernel_49851/2407001818.py in <module>
45 df.iloc[i*batch_size:min(len(text_list), (i+1)*batch_size), -1] = [output]
46 else:
---> 47 df.iloc[i*batch_size:min(len(text_list), (i+1)*batch_size), -1] = KPE.extract_phrases_all(docs)
48
49 df['phrases_all_cmpln_summary'] = df['phrases_all_cmpln_summary'].astype(str)
/tmp/ipykernel_49851/4174669147.py in extract_phrases_all(self, x, top)
42 def extract_phrases_all(self, x, top=20):
43 x = self.preprocess(x)
---> 44 keyphrases = self.kw_model.extract_keywords(x, vectorizer=self.kcv, top_n=top, stop_words=None, use_mmr=True, diversity=0.7)
45 return keyphrases
46
~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/keybert/_model.py in extract_keywords(self, docs, candidates, keyphrase_ngram_range, stop_words, top_n, min_df, use_maxsum, use_mmr, diversity, nr_candidates, vectorizer, highlight, seed_keywords, doc_embeddings, word_embeddings)
144 # Extract potential words using a vectorizer / tokenizer
145 if vectorizer:
--> 146 count = vectorizer.fit(docs)
147 else:
148 try:
~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py in fit(self, raw_documents)
204
205 self._check_params()
--> 206 X = super().fit_transform(raw_documents)
207 self._tfidf.fit(X)
208 return self
~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/keyphrase_vectorizers/keyphrase_count_vectorizer.py in fit_transform(self, raw_documents)
220
221 # fit
--> 222 KeyphraseCountVectorizer.fit(self=self, raw_documents=raw_documents)
223
224 # transform
~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/keyphrase_vectorizers/keyphrase_count_vectorizer.py in fit(self, raw_documents)
167 lowercase=self.lowercase, workers=self.workers,
168 spacy_exclude=self.spacy_exclude,
--> 169 custom_pos_tagger=self.custom_pos_tagger)
170
171 # remove keyphrases that have more than 8 words, as they are probably no real keyphrases
~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py in _get_pos_keyphrases(self, document_list, stop_words, spacy_pipeline, pos_pattern, spacy_exclude, custom_pos_tagger, lowercase, workers)
286 if isinstance(stop_words, str):
287 try:
--> 288 stop_words_list = set(nltk.corpus.stopwords.words(stop_words))
289 except LookupError:
290 logger = logging.getLogger('KeyphraseVectorizer')
~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/nltk/corpus/reader/wordlist.py in words(self, fileids, ignore_lines_startswith)
19 return [
20 line
---> 21 for line in line_tokenize(self.raw(fileids))
22 if not line.startswith(ignore_lines_startswith)
23 ]
~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/nltk/corpus/reader/api.py in raw(self, fileids)
216 contents = []
217 for f in fileids:
--> 218 with self.open(f) as fp:
219 contents.append(fp.read())
220 return concat(contents)
~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/nltk/corpus/reader/api.py in open(self, file)
229 """
230 encoding = self.encoding(file)
--> 231 stream = self._root.join(file).open(encoding)
232 return stream
233
~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/nltk/data.py in open(self, encoding)
322
323 def open(self, encoding=None):
--> 324 stream = open(self._path, "rb")
325 if encoding is not None:
326 stream = SeekableUnicodeStreamReader(stream, encoding)
IsADirectoryError: [Errno 21] Is a directory: '/data/bdlml/mkabra/nltk_data/corpora/stopwords'
Code
class KeyPhraseExtractor():
    """Wrap a KeyBERT model with a POS-pattern keyphrase vectorizer.

    Parameters
    ----------
    kw_model : KeyBERT
        An initialized KeyBERT instance used for keyword extraction.
    """

    def __init__(self, kw_model):
        self.kw_model = kw_model
        # FIX: stop_words must be None, not ''. KeyphraseVectorizers treats
        # any str (including '') as an nltk stopwords language and calls
        # nltk.corpus.stopwords.words('') — the empty fileid makes nltk open
        # the corpus *directory* itself, raising the reported
        # IsADirectoryError: [Errno 21] Is a directory: '.../corpora/stopwords'
        self.kcv = KeyphraseTfidfVectorizer(
            stop_words=None,
            pos_pattern='<R.*|MD|J.*>*<VB.*>+<.*>{0,2}<R.*|J.*>+<.*>{0,2}<N.*>+|<J.*|N.*>+<R.*>+<.*>{0,2}<J.*>*<N.*>+|<R.*>+<.*>{0,2}<R.*|MD|J.*>*<VB.*|J.*|N.*>+|<.*>{0,2}<R.*|MD|J.*>*<VB.*>+<.*>{0,2}<N.*|R.*|J.*>+|<R.*|MD|J.*>*<VB.*>+<.*>{0,2}<N.*>+|<.*>{0,2}<N.*>+<R.*|MD|J.*>*<VB.*>+<.*>{0,2}<N.*>*',
        )

    def preprocess(self, x):
        """Insert a space after '.' and turn '//' into '. ' so text tokenizes.

        Accepts a single document or a list of documents; non-strings are
        coerced with ``str``. NOTE: a list argument is mutated in place
        (callers in this script do not rely on the original contents).
        """
        if isinstance(x, list):
            for i in range(len(x)):
                x[i] = str(x[i]).replace(".", ". ").replace("//", ". ")
        else:
            x = str(x).replace(".", ". ").replace("//", ". ")
        return x
        # (removed an unreachable stray `return keyphrases` that followed
        # `return x` in the original paste — dead code, `keyphrases` undefined)

    def extract_phrases_all(self, x, top=20):
        """Return the top ``top`` MMR-diversified keyphrases for ``x``."""
        x = self.preprocess(x)
        keyphrases = self.kw_model.extract_keywords(
            x, vectorizer=self.kcv, top_n=top, stop_words=None,
            use_mmr=True, diversity=0.7,
        )
        return keyphrases
# --- Script: batch keyphrase extraction over the input CSV ---
kw_model = KeyBERT(model='ProsusAI/finbert')
df = pd.read_csv(input_path + input_file + '.csv')
df = df.loc[:10]  # limit to the first rows while debugging
print(len(df))
column = 'cmpln_summary'
new_column = 'cmpln_summary'
#nlp = spacy.load("en_core_web_sm")
KPE = KeyPhraseExtractor(kw_model)
df['phrases_all_cmpln_summary'] = None
batch_size = 256
text_list = df[new_column].astype(str).tolist()

# FIX: ceil division for the batch count. The original
# `len(text_list) // batch_size + 1` yields an extra *empty* batch whenever
# the length is an exact multiple of batch_size (and always on an empty
# list), which would call KeyBERT on [].
num_batches = -(-len(text_list) // batch_size)
for i in tqdm(range(num_batches), desc='Extracting KeyPhrases'):
    docs = text_list[i * batch_size:(i + 1) * batch_size]
    if not docs:
        continue  # defensive guard: nothing left to extract
    row_slice = slice(i * batch_size, min(len(text_list), (i + 1) * batch_size))
    if len(docs) == 1:
        # KeyBERT returns a flat list for a single doc; wrap it so the
        # assignment places one list into the single target cell.
        output = KPE.extract_phrases_all(docs[0])
        if output == []:
            df.iloc[row_slice, -1] = [['None Found']]
        else:
            df.iloc[row_slice, -1] = [output]
    else:
        df.iloc[row_slice, -1] = KPE.extract_phrases_all(docs)

df['phrases_all_cmpln_summary'] = df['phrases_all_cmpln_summary'].astype(str)
I think something went wrong with the installation of the nltk stopwords
corpus. Perhaps running the following solves your issue:
import nltk
nltk.download('stopwords')
If that does not work, I would advise posting an issue on the KeyphraseVectorizers Github page since this is not an issue with KeyBERT. Having said that, I do not think it is an issue with KeyphraseVectorizers itself but merely how nltk was installed/downloaded.