MaartenGr / KeyBERT

Minimal keyword extraction with BERT

Home Page: https://MaartenGr.github.io/KeyBERT/

IsADirectoryError: [Errno 21] Is a directory: '/data/bdlml/mkabra/nltk_data/corpora/stopwords'

minniekabra opened this issue

I am getting this error while using KeyBERT, and I have no idea how to resolve it:
IsADirectoryError: [Errno 21] Is a directory: '/data/bdlml/mkabra/nltk_data/corpora/stopwords'

I deleted the directory, and then it raises OSError: No such file or directory: '/data/bdlml/mkabra/nltk_data/corpora/stopwords'

Can you please help with it?

Extracting KeyPhrases:   0%|          | 0/1 [00:00<?, ?it/s]
---------------------------------------------------------------------------
IsADirectoryError                         Traceback (most recent call last)
/tmp/ipykernel_49851/2407001818.py in <module>
     45             df.iloc[i*batch_size:min(len(text_list), (i+1)*batch_size), -1] = [output]
     46     else:
---> 47         df.iloc[i*batch_size:min(len(text_list), (i+1)*batch_size), -1] = KPE.extract_phrases_all(docs)
     48 
     49 df['phrases_all_cmpln_summary'] = df['phrases_all_cmpln_summary'].astype(str)

/tmp/ipykernel_49851/4174669147.py in extract_phrases_all(self, x, top)
     42     def extract_phrases_all(self, x, top=20):
     43         x = self.preprocess(x)
---> 44         keyphrases = self.kw_model.extract_keywords(x, vectorizer=self.kcv, top_n=top, stop_words=None, use_mmr=True, diversity=0.7)
     45         return keyphrases
     46 

~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/keybert/_model.py in extract_keywords(self, docs, candidates, keyphrase_ngram_range, stop_words, top_n, min_df, use_maxsum, use_mmr, diversity, nr_candidates, vectorizer, highlight, seed_keywords, doc_embeddings, word_embeddings)
    144         # Extract potential words using a vectorizer / tokenizer
    145         if vectorizer:
--> 146             count = vectorizer.fit(docs)
    147         else:
    148             try:

~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py in fit(self, raw_documents)
    204 
    205         self._check_params()
--> 206         X = super().fit_transform(raw_documents)
    207         self._tfidf.fit(X)
    208         return self

~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/keyphrase_vectorizers/keyphrase_count_vectorizer.py in fit_transform(self, raw_documents)
    220 
    221         # fit
--> 222         KeyphraseCountVectorizer.fit(self=self, raw_documents=raw_documents)
    223 
    224         # transform

~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/keyphrase_vectorizers/keyphrase_count_vectorizer.py in fit(self, raw_documents)
    167                                                    lowercase=self.lowercase, workers=self.workers,
    168                                                    spacy_exclude=self.spacy_exclude,
--> 169                                                    custom_pos_tagger=self.custom_pos_tagger)
    170 
    171         # remove keyphrases that have more than 8 words, as they are probably no real keyphrases

~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py in _get_pos_keyphrases(self, document_list, stop_words, spacy_pipeline, pos_pattern, spacy_exclude, custom_pos_tagger, lowercase, workers)
    286         if isinstance(stop_words, str):
    287             try:
--> 288                 stop_words_list = set(nltk.corpus.stopwords.words(stop_words))
    289             except LookupError:
    290                 logger = logging.getLogger('KeyphraseVectorizer')

~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/nltk/corpus/reader/wordlist.py in words(self, fileids, ignore_lines_startswith)
     19         return [
     20             line
---> 21             for line in line_tokenize(self.raw(fileids))
     22             if not line.startswith(ignore_lines_startswith)
     23         ]

~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/nltk/corpus/reader/api.py in raw(self, fileids)
    216         contents = []
    217         for f in fileids:
--> 218             with self.open(f) as fp:
    219                 contents.append(fp.read())
    220         return concat(contents)

~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/nltk/corpus/reader/api.py in open(self, file)
    229         """
    230         encoding = self.encoding(file)
--> 231         stream = self._root.join(file).open(encoding)
    232         return stream
    233 

~/mls_python_venv/miniconda/envs/mk_ve_3/lib/python3.7/site-packages/nltk/data.py in open(self, encoding)
    322 
    323     def open(self, encoding=None):
--> 324         stream = open(self._path, "rb")
    325         if encoding is not None:
    326             stream = SeekableUnicodeStreamReader(stream, encoding)

IsADirectoryError: [Errno 21] Is a directory: '/data/bdlml/mkabra/nltk_data/corpora/stopwords'
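
Reading the last frames, the vectorizer's stop_words string appears to be forwarded straight to nltk.corpus.stopwords.words(). A minimal sketch that should reproduce the same error path, assuming the stopwords corpus directory exists on disk:

import nltk

# Hypothesis from the traceback: an empty fileid makes NLTK join the corpus
# root with '' and open the stopwords *directory* instead of a word-list file
nltk.corpus.stopwords.words('')  # expected: IsADirectoryError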

Code

from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseTfidfVectorizer
import pandas as pd
from tqdm import tqdm


class KeyPhraseExtractor():
    
    def __init__(self, kw_model):
        self.kw_model = kw_model
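        # Note: stop_words='' is still a str, so the vectorizer forwards it to
        # nltk.corpus.stopwords.words() (see the traceback above); stop_words=None
        # should skip the NLTK stop-word lookup entirely.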
        self.kcv = KeyphraseTfidfVectorizer(stop_words='', pos_pattern = '<R.*|MD|J.*>*<VB.*>+<.*>{0,2}<R.*|J.*>+<.*>{0,2}<N.*>+|<J.*|N.*>+<R.*>+<.*>{0,2}<J.*>*<N.*>+|<R.*>+<.*>{0,2}<R.*|MD|J.*>*<VB.*|J.*|N.*>+|<.*>{0,2}<R.*|MD|J.*>*<VB.*>+<.*>{0,2}<N.*|R.*|J.*>+|<R.*|MD|J.*>*<VB.*>+<.*>{0,2}<N.*>+|<.*>{0,2}<N.*>+<R.*|MD|J.*>*<VB.*>+<.*>{0,2}<N.*>*' )

    def preprocess(self, x):
        # Add a space after periods and replace "//" so sentence boundaries survive
        if isinstance(x, list):
            for i in range(len(x)):
                x[i] = str(x[i]).replace(".", ". ").replace("//", ". ")
        else:
            x = str(x).replace(".", ". ").replace("//", ". ")

        return x

    def extract_phrases_all(self, x, top=20):
        x = self.preprocess(x)
        keyphrases = self.kw_model.extract_keywords(x, vectorizer=self.kcv, top_n=top, stop_words=None, use_mmr=True, diversity=0.7)
        return keyphrases
    

    
# Set up the KeyBERT model and load the input data
kw_model = KeyBERT(model='ProsusAI/finbert')
df=pd.read_csv(input_path+input_file+'.csv')
df=df.loc[:10]
print(len(df))
column = 'cmpln_summary'
new_column = 'cmpln_summary'

#nlp = spacy.load("en_core_web_sm")
KPE = KeyPhraseExtractor(kw_model)

df['phrases_all_cmpln_summary'] = None
batch_size = 256
text_list = df[new_column].astype(str).tolist()

for i in tqdm(range(len(text_list) // batch_size + 1), desc='Extracting KeyPhrases'):
    docs = text_list[i*batch_size:(i+1)*batch_size]
    if len(docs) == 1:
        output = KPE.extract_phrases_all(docs[0])
        if output == []:
            df.iloc[i*batch_size:min(len(text_list), (i+1)*batch_size), -1] = [['None Found']]
        else:
            df.iloc[i*batch_size:min(len(text_list), (i+1)*batch_size), -1] = [output]
    else:
        df.iloc[i*batch_size:min(len(text_list), (i+1)*batch_size), -1] = KPE.extract_phrases_all(docs)
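
For context on the len(docs) == 1 branch above: as far as I can tell, KeyBERT returns a flat list of (keyword, score) tuples for a single document and a list of such lists for a list of documents, so the two cases need different handling when written back into the dataframe. A minimal sketch with hypothetical texts:

from keybert import KeyBERT

kw_model = KeyBERT()  # default model here; the code above uses 'ProsusAI/finbert'
single = kw_model.extract_keywords("Interest rates rose sharply this quarter.")
batch = kw_model.extract_keywords(["First document text.", "Second document text."])
# single is a flat list of (keyword, score) tuples, while batch is a list
# containing one such list per document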

I think something went wrong with the download of the stopwords corpus through the nltk package. Perhaps running the following solves your issue:

import nltk
nltk.download('stopwords')
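
If the download succeeds, a quick sanity check would be (assuming the corpus lands in a directory on nltk.data.path):

from nltk.corpus import stopwords

# should print a handful of English stop words if the corpus is intact
print(stopwords.words('english')[:5])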

If that does not work, I would advise opening an issue on the KeyphraseVectorizers GitHub page, since this error does not originate in KeyBERT. Having said that, I do not think it is an issue with KeyphraseVectorizers itself either, but merely with how nltk was installed/downloaded.
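
If the corpus directory itself got corrupted (for instance, a stray subdirectory was created inside it), a sketch of a clean re-download, using the nltk_data path from the traceback (adjust to your environment):

import shutil
import nltk

# path taken from the traceback above; adjust to your environment and make
# sure it is on nltk.data.path so NLTK can find the corpus afterwards
nltk_dir = '/data/bdlml/mkabra/nltk_data'

# remove the (possibly corrupted) stopwords corpus, then fetch a fresh copy
shutil.rmtree(nltk_dir + '/corpora/stopwords', ignore_errors=True)
nltk.download('stopwords', download_dir=nltk_dir)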