Splitting a pdf into pages and joining again

Question

Splitting a pdf into pages and joining again

BjornFJohansson opened this issue 5 months ago · comments

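I made two scripts: one splits PDFs into single pages and the other joins PDFs. The join script fails to join the pages that the split script produces.
Tested on the attached PDF: [2019_ReferenceWorkEntry_.pdf](https://github.com/jorisschellekens/borb/files/14094251/2019_ReferenceWorkEntry_.pdf)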

What am I doing wrong?

Cell In[23], line 4
pdf = PDF.loads(pdf_file_handle)

File ~/miniforge3/envs/bjorn311/lib/python3.11/site-packages/borb/pdf/pdf.py:83 in loads
document: Document = ReadAnyObjectTransformer().transform(

...

  File ~/miniforge3/envs/bjorn311/lib/python3.11/site-packages/borb/io/read/tokenize/high_level_tokenizer.py:144 in read_indirect_object
    value = self.read_object()

  File ~/miniforge3/envs/bjorn311/lib/python3.11/site-packages/borb/io/read/tokenize/high_level_tokenizer.py:206 in read_object
    return self.read_dictionary()

  File ~/miniforge3/envs/bjorn311/lib/python3.11/site-packages/borb/io/read/tokenize/high_level_tokenizer.py:94 in read_dictionary
    assert token.get_token_type() == TokenType.NAME

AssertionError

split:

#!/home/bjorn/miniforge3/envs/bjorn311/bin python3
# -*- coding: utf-8 -*-
import sys
from pathlib import Path
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF
from tqdm import tqdm

# Sample input used when testing this script:
# https://github.com/jorisschellekens/borb/files/14094251/2019_ReferenceWorkEntry_.pdf

# PDFs to split: the paths given on the command line, or, when none are
# given, every *.pdf in the current directory in sorted order.
script, *cliarg = sys.argv
pdfpaths = [Path(p) for p in cliarg] or sorted(Path(".").glob("*.pdf"))

for pdfpath in tqdm(pdfpaths):

    # The input file name without its extension prefixes every output file.
    fn = pdfpath.stem

    with open(pdfpath, "rb") as pdf_file_handle:
        pdf = PDF.loads(pdf_file_handle)

    number_of_pages = int(pdf.get_document_info().get_number_of_pages())

    for i in range(number_of_pages):
        print(i)
        # Each page is placed in its own, newly created document.
        outpdf = Document()
        outpdf.add_page(pdf.get_page(i))
        # Written to the current directory as <stem>_<zero-padded index>.pdf.
        with open(f"{fn}_{i:03d}.pdf", "wb") as pdf_out_handle:
            PDF.dumps(pdf_out_handle, outpdf)

join:

#!/home/bjorn/miniforge3/envs/bjorn311/bin python3
# -*- coding: utf-8 -*-
# https://pdfstandalone.com/en/merge-pdf
import sys
from pathlib import Path
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF
from tqdm import tqdm

# Inputs to merge: the command-line paths in the order given, or, when none
# are given, every *.pdf in the current directory in sorted order.
script, *cliarg = sys.argv
outpath = Path("output.pdf")
pdfpaths = [Path(arg) for arg in cliarg]
if not pdfpaths:
    pdfpaths = sorted(Path(".").glob("*.pdf"))

# Keep the merge target out of its own list of inputs.
if outpath in pdfpaths:
    pdfpaths.remove(outpath)

output_document = Document()
for pdfpath in tqdm(pdfpaths):
    # Load each input and append it to the combined document while the
    # source file is still open.
    with open(pdfpath, "rb") as source_handle:
        output_document.add_document(PDF.loads(source_handle))

with open(outpath, "wb") as merged_handle:
    PDF.dumps(merged_handle, output_document)