Splitting a pdf into pages and joining again
BjornFJohansson opened this issue · comments
I made two scripts: one to split PDFs into pages and one to join PDFs. The join script fails to join the pages that the split script produces.
Tested on the attached PDF: [2019_ReferenceWorkEntry_.pdf](https://github.com/jorisschellekens/borb/files/14094251/2019_ReferenceWorkEntry_.pdf)
What am I doing wrong?
Cell In[23], line 4
pdf = PDF.loads(pdf_file_handle)
File ~/miniforge3/envs/bjorn311/lib/python3.11/site-packages/borb/pdf/pdf.py:83 in loads
document: Document = ReadAnyObjectTransformer().transform(
...
File ~/miniforge3/envs/bjorn311/lib/python3.11/site-packages/borb/io/read/tokenize/high_level_tokenizer.py:144 in read_indirect_object
value = self.read_object()
File ~/miniforge3/envs/bjorn311/lib/python3.11/site-packages/borb/io/read/tokenize/high_level_tokenizer.py:206 in read_object
return self.read_dictionary()
File ~/miniforge3/envs/bjorn311/lib/python3.11/site-packages/borb/io/read/tokenize/high_level_tokenizer.py:94 in read_dictionary
assert token.get_token_type() == TokenType.NAME
AssertionError
split:
#!/home/bjorn/miniforge3/envs/bjorn311/bin python3
# -*- coding: utf-8 -*-
import sys
from pathlib import Path
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF
from tqdm import tqdm
# Split every input PDF into one single-page PDF per page.
#
# Inputs are the paths given on the command line; with no arguments, every
# "*.pdf" in the current directory is used, sorted by path.
#
# Sample input attached to the issue:
# https://github.com/jorisschellekens/borb/files/14094251/2019_ReferenceWorkEntry_.pdf
script, *cliarg = sys.argv
pdfpaths = [Path(p) for p in cliarg] or sorted(Path(".").glob("*.pdf"))

for pdfpath in tqdm(pdfpaths):
    fn = pdfpath.stem
    with open(pdfpath, "rb") as pdf_file_handle:
        pdf = PDF.loads(pdf_file_handle)
    # range() needs an int; int() coerces whatever numeric type borb
    # returns for the page count.
    number_of_pages = int(pdf.get_document_info().get_number_of_pages())
    for i in range(number_of_pages):
        print(i)
        # Each page is added to a fresh Document of its own.
        outpdf = Document()
        outpdf.add_page(pdf.get_page(i))
        # Written to the current directory as "<stem>_<zero-padded index>.pdf".
        with open(f"{fn}_{i:03d}.pdf", "wb") as pdf_out_handle:
            PDF.dumps(pdf_out_handle, outpdf)
join:
#!/home/bjorn/miniforge3/envs/bjorn311/bin python3
# -*- coding: utf-8 -*-
# https://pdfstandalone.com/en/merge-pdf
import sys
from pathlib import Path
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF
from tqdm import tqdm
# Merge several PDFs into a single "output.pdf".
#
# Inputs are the paths given on the command line; with no arguments, every
# "*.pdf" in the current directory is used, sorted by path.
script, *cliarg = sys.argv
pdfpaths = [Path(p) for p in cliarg] or sorted(Path(".").glob("*.pdf"))

outpath = Path("output.pdf")
output_document = Document()

# Drop the output file from the inputs (first match only) so that a result
# left over from an earlier run is not merged into the new one.
if outpath in pdfpaths:
    pdfpaths.remove(outpath)

for pdfpath in tqdm(pdfpaths):
    with pdfpath.open("rb") as source_handle:
        loaded = PDF.loads(source_handle)
    # Add the loaded document to the combined one.
    output_document.add_document(loaded)

with outpath.open("wb") as target_handle:
    PDF.dumps(target_handle, output_document)