KeyError: xml:id
raffaem opened this issue · comments
Raffaele Mancuso commented
Unfortunately I cannot share the PDF, as it is proprietary, but calling `scipdf.parse_pdf_to_dict` on it raises the following error:
KeyError Traceback (most recent call last)
Cell In [6], line 8
6 for app in apps:
7 app = app.resolve()
----> 8 article_dict = scipdf.parse_pdf_to_dict(str(app))
9 ref = article_dict["references"]
10 ref = [l for l in ref if is_valid(l)]
File ~/.local/lib/python3.10/site-packages/scipdf/pdf/parse_pdf.py:358, in parse_pdf_to_dict(pdf_path, fulltext, soup, as_list, grobid_url)
339 """
340 Parse the given PDF and return dictionary of the parsed article
341
(...)
353 article_dict: dict, dictionary of an article
354 """
355 parsed_article = parse_pdf(
356 pdf_path, fulltext=fulltext, soup=soup, grobid_url=grobid_url
357 )
--> 358 article_dict = convert_article_soup_to_dict(parsed_article, as_list=as_list)
359 return article_dict
File ~/.local/lib/python3.10/site-packages/scipdf/pdf/parse_pdf.py:321, in convert_article_soup_to_dict(article, as_list)
319 article_dict["sections"] = parse_sections(article, as_list=as_list)
320 article_dict["references"] = parse_references(article)
--> 321 article_dict["figures"] = parse_figure_caption(article)
323 doi = article.find("idno", attrs={"type": "DOI"})
324 doi = doi.text if doi is not None else ""
File ~/.local/lib/python3.10/site-packages/scipdf/pdf/parse_pdf.py:260, in parse_figure_caption(article)
258 for figure in figures:
259 figure_type = figure.attrs.get("type") or ""
--> 260 figure_id = figure.attrs["xml:id"] or ""
261 label = figure.find("label").text
262 if figure_type == "table":
KeyError: 'xml:id'