modesty / pdf2json

converts binary PDF to JSON and text, for server-side PDF processing and command-line use.

Home Page:https://github.com/modesty/pdf2json

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Text X positions Incorrect

mainfraame opened this issue · comments

The document I am working with is an 11.5 x 16 PDF document. The height I get back from pdf2json is 51.75, and when I examine the Texts' locations (x, y), assuming they are also expressed in page units (PU), the y values seem to be correct. However, the x values seem to be off for elements located on the right half of the document. For instance, I placed text ("BottomRight") in the bottom-right corner and got back the following coordinates: { x: 193.45312500000003, y: 50.918749999999996 }. Seeing that the document is 11.5 x 16 and the height is 51.75 PU, this would technically make the width 74.25 PU. How is it possible for a text to have an x position of 193.45... when the maximum is 74.25 PU?

define(function(require,exports,modules){

```
var fs        = require('fs'),
    _         = require('underscore-node'),
    PDFParser = require('pdf2json/pdfparser'),
    pdfParser = new PDFParser(),
    pdfutils = require('pdfutils').pdfutils;


/**
 * Parses a PDF and collects the camelCase-looking text runs found on it.
 *
 * Constructing an instance immediately calls `init(base, file)`, which
 * registers handlers on the shared `pdfParser` and starts loading the file.
 * Results are appended to `pages` later, from the parser/pdfutils callbacks.
 *
 * @param {string} base - path prefix; concatenated directly with `file`.
 * @param {string} file - name of the PDF file to parse.
 */
var PDF = function(base,file){

    var pdf = this;

    pdf.base = null;
    pdf.file = null;

    pdf.adors = [];
    pdf.pages = [];

    /**
     * Stores the file location, wires the parser events and starts parsing.
     *
     * @param {string} base - path prefix; concatenated directly with `file`.
     * @param {string} file - name of the PDF file to parse.
     */
    pdf.init = function(base,file){

        console.log('starting pdf parsing');

        // set base path + file name
        pdf.file = file;
        pdf.base = base;

        // set the bindings
        pdfParser.on("pdfParser_dataReady", _.bind(pdf.initParse, this));
        pdfParser.on("pdfParser_dataError", _.bind(pdf.parseDataError, this));

        // start parsing
        pdfParser.loadPDF(base + file);
    };

    /**
     * Handler for "pdfParser_dataReady": opens the same file with pdfutils
     * and parses the page(s) using both data sources.
     *
     * @param {Object} data - parser output; `data.PDFJS.pages` is read here.
     */
    pdf.initParse = function(data){

        pdfutils(pdf.base + pdf.file, function(err,doc){

            // Without this check a pdfutils failure leaves `doc` undefined and
            // the loop below throws inside the callback.
            if (err) {
                pdf.parseDataError(err);
                return;
            }

            // Only the first page is parsed for now; the intended bound is
            // data.PDFJS.pages.length. Math.min also covers an empty page list.
            var pageCount = Math.min(1, data.PDFJS.pages.length);

            for (var i = 0; i < pageCount; i++) {
                pdf.pages.push(pdf.parsePage(data.PDFJS.pages[i], doc[i]));
            }
        });
    };

    /**
     * Builds the summary object for one page.
     *
     * @param {Object} page - page entry from the parser (`Height`, `Texts`).
     * @param {Object} doc  - matching pdfutils page (`width`, `height`).
     * @returns {{adors: Array, ratio: number, width: *, height: *}}
     *          `ratio` is `doc.height / page.Height`; `adors` holds the
     *          entries collected by `findCamelCase`.
     */
    pdf.parsePage = function(page,doc){

        var parsedPage = {};

        parsedPage.adors  = [];

        parsedPage.ratio  = doc.height / page.Height;
        parsedPage.width  = doc.width;
        parsedPage.height = doc.height;

        for (var i = 0; i < page.Texts.length; i++) {
            // Only the first run (R[0]) of each text entry is inspected.
            var run = page.Texts[i].R[0];
            pdf.findCamelCase(run.T, page.Texts[i], run.TS, parsedPage, parsedPage.ratio);
        }

        // TODO:: find solution for this xml parsing (grabbing pictures)...
        return parsedPage;
    };

    /**
     * Appends one entry to `parsedPage.adors` for every camelCase-looking
     * match found in `text`. Each entry carries the whole `text`, not just
     * the matched substring.
     *
     * @param {string} text         - text of the run to scan.
     * @param {Object} textLocation - text entry providing `x` and `y`.
     * @param {Array}  textData     - style array; indices 1, 2 and 3 are read
     *                                as size, bold flag and italics flag.
     * @param {Object} parsedPage   - page summary whose `adors` is appended to.
     * @param {number} ratio        - currently unused; kept for callers.
     */
    pdf.findCamelCase = function(text,textLocation,textData,parsedPage,ratio){
        // TODO :: fix regex to only accept camelcase without spacing...
        var matches = text.match(/[A-Z]([A-Z0-9]*[a-z][a-z0-9]*[A-Z]|[a-z0-9]*[A-Z][A-Z0-9]*[a-z])[A-Za-z0-9]*/g) || [];

        for (var i = 0; i < matches.length; i++) {

            var t = {};

            t.text    = text;
            t.size    = textData[1];
            t.bold    = textData[2] == 1;
            t.italics = textData[3] == 1;
            // Coordinates are copied unchanged from the text entry.
            t.position = {
                x: textLocation.x,
                y: textLocation.y
            };

            console.log(t.text, t.position);

            parsedPage.adors.push(t);
        }
    };

    /**
     * Handler for "pdfParser_dataError" (and for pdfutils failures): logs
     * the error.
     *
     * @param {*} err - error value passed by the failing component.
     */
    pdf.parseDataError = function(err){

        console.log('pdf parse error...',err);
    };

    pdf.init(base,file);
};

return new PDF('/Users/dayne/sites/wl/server/utils/','test.pdf');
```

});

Just found that it fixed itself, when I changed the following in lib/pdfunit.js line 12:

var gridXPerInch = 4.0; // was 11.0 -- why was this at 11 and not 4?
var gridYPerInch = 4.0;

fixed in v1.0.8

When I used the above code, I got an error like this:
node pro1.js
/home/hades3/Desktop/converter/pro1.js:1
(function (exports, require, module, __filename, __dirname) { define(function(require,exports,modules){
^

ReferenceError: define is not defined
at Object.<anonymous> (/home/hades3/Desktop/converter/pro1.js:1:63)
at Module._compile (module.js:570:32)
at Object.Module._extensions..js (module.js:579:10)
at Module.load (module.js:487:32)
at tryModuleLoad (module.js:446:12)
at Function.Module._load (module.js:438:3)
at Module.runMain (module.js:604:10)
at run (bootstrap_node.js:389:7)
at startup (bootstrap_node.js:149:9)
at bootstrap_node.js:504:3