VaclavRut / actor-amazon-crawler

Amazon crawler - this configuration will extract items for the keywords that you specify in the input, and it will automatically extract all pages for each given keyword. You can specify multiple keywords in the input for one run.

Home Page:https://apify.com

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Can't run crawler on apify

empeje opened this issue · comments

Hi @VaclavRut,

I have a project that used your crawler in the past. We decided to stop it a few months ago and cancelled the Apify subscription. Recently we restarted the project and tried to run your crawler on Apify without a subscription, but I keep getting an error.

Input

{
  "country": "US",
  "directUrls": [
    {
      "url": "https://www.amazon.com/dp/B07P6Y8L3F",
      "userData": {
        "label": "detail",
        "keyword": "B07P6Y8L3F",
        "asin": "B07P6Y8L3F",
        "detailUrl": "https://www.amazon.com/dp/B07P6Y8L3F",
        "sellerUrl": "https://www.amazon.com/gp/offer-listing/B07P6Y8L3F"
      }
    }
  ],
  "maxResults": 1,
  "proxy": {
    "useApifyProxy": false
  }
}

Output

2020-05-16T05:34:52.631Z ACTOR: Creating Docker container.

2020-05-16T05:34:57.465Z ACTOR: Starting Docker container.
2020-05-16T05:34:58.769Z 
2020-05-16T05:34:58.769Z > amazon@1.0.0 start /usr/src/app
2020-05-16T05:34:58.770Z > node ./src/main.js
2020-05-16T05:34:58.771Z 
2020-05-16T05:35:00.372Z INFO: System info {"apifyVersion":"0.19.1","apifyClientVersion":"0.5.26","osType":"Linux","nodeVersion":"v12.16.1"}
2020-05-16T05:35:00.373Z WARNING: You are using an outdated version (0.19.1) of Apify SDK. We recommend you to update to the latest version (0.20.4).
2020-05-16T05:35:00.374Z          Read more about Apify SDK versioning at: https://help.apify.com/en/articles/3184510-updates-and-versioning-of-apify-sdk
2020-05-16T05:35:00.491Z INFO: Going to enqueue 1 requests from input.
2020-05-16T05:35:00.492Z https://www.amazon.com/dp/B07P6Y8L3F
2020-05-16T05:35:02.277Z INFO: AutoscaledPool state {"currentConcurrency":0,"desiredConcurrency":2,"systemStatus":{"isSystemIdle":true,"memInfo":{"isOverloaded":false,"limitRatio":0.2,"actualRatio":null},"eventLoopInfo":{"isOverloaded":false,"limitRatio":0.4,"actualRatio":null},"cpuInfo":{"isOverloaded":false,"limitRatio":0.4,"actualRatio":null},"clientInfo":{"isOverloaded":false,"limitRatio":0.3,"actualRatio":null}}}
2020-05-16T05:35:02.392Z ERROR: BasicCrawler: handleRequestFunction failed, reclaiming failed request back to the list or queue {"url":"https://www.amazon.com/dp/B07P6Y8L3F","retryCount":1,"id":"CxCejBi58nyfOEt"}
2020-05-16T05:35:02.393Z   Error: Request for https://www.amazon.com/dp/B07P6Y8L3F aborted due to abortFunction
2020-05-16T05:35:02.393Z     at DuplexWrapper.<anonymous> (/usr/src/app/node_modules/@apify/http-request/src/index.js:167:25)
2020-05-16T05:35:02.394Z     at DuplexWrapper.emit (events.js:311:20)
2020-05-16T05:35:02.394Z     at EventEmitter.<anonymous> (/usr/src/app/node_modules/got/source/as-stream.js:60:9)
2020-05-16T05:35:02.395Z     at EventEmitter.emit (events.js:311:20)
2020-05-16T05:35:02.395Z     at module.exports (/usr/src/app/node_modules/got/source/get-response.js:22:10)
2020-05-16T05:35:02.396Z     at ClientRequest.handleResponse (/usr/src/app/node_modules/got/source/request-as-event-emitter.js:155:5)
2020-05-16T05:35:02.396Z     at Object.onceWrapper (events.js:418:26)
2020-05-16T05:35:02.397Z     at ClientRequest.emit (events.js:323:22)
2020-05-16T05:35:02.397Z     at ClientRequest.origin.emit (/usr/src/app/node_modules/@szmarczak/http-timer/source/index.js:37:11)
2020-05-16T05:35:02.398Z     at HTTPParser.parserOnIncomingClient [as onIncoming] (_http_client.js:603:27)
2020-05-16T05:35:02.398Z     at HTTPParser.parserOnHeadersComplete (_http_common.js:119:17)
2020-05-16T05:35:02.398Z     at Socket.socketOnData (_http_client.js:476:22)
2020-05-16T05:35:02.399Z     at Socket.emit (events.js:311:20)
2020-05-16T05:35:02.399Z     at Socket.Readable.read (_stream_readable.js:512:10)
2020-05-16T05:35:02.400Z     at Socket.read (net.js:618:39)
2020-05-16T05:35:02.401Z     at flow (_stream_readable.js:989:34)
2020-05-16T05:35:02.401Z     at resume_ (_stream_readable.js:970:3)
2020-05-16T05:35:02.402Z     at processTicksAndRejections (internal/process/task_queues.js:84:21)
2020-05-16T05:35:05.483Z ERROR: BasicCrawler: handleRequestFunction failed, reclaiming failed request back to the list or queue {"url":"https://www.amazon.com/dp/B07P6Y8L3F","retryCount":2,"id":"CxCejBi58nyfOEt"}
2020-05-16T05:35:05.484Z   Error: Request for https://www.amazon.com/dp/B07P6Y8L3F aborted due to abortFunction
2020-05-16T05:35:05.484Z     at DuplexWrapper.<anonymous> (/usr/src/app/node_modules/@apify/http-request/src/index.js:167:25)
2020-05-16T05:35:05.485Z     at DuplexWrapper.emit (events.js:311:20)
2020-05-16T05:35:05.485Z     at EventEmitter.<anonymous> (/usr/src/app/node_modules/got/source/as-stream.js:60:9)
2020-05-16T05:35:05.486Z     at EventEmitter.emit (events.js:311:20)
2020-05-16T05:35:05.494Z     at module.exports (/usr/src/app/node_modules/got/source/get-response.js:22:10)
2020-05-16T05:35:05.494Z     at ClientRequest.handleResponse (/usr/src/app/node_modules/got/source/request-as-event-emitter.js:155:5)
2020-05-16T05:35:05.495Z     at Object.onceWrapper (events.js:418:26)
2020-05-16T05:35:05.495Z     at ClientRequest.emit (events.js:323:22)
2020-05-16T05:35:05.496Z     at ClientRequest.origin.emit (/usr/src/app/node_modules/@szmarczak/http-timer/source/index.js:37:11)
2020-05-16T05:35:05.496Z     at HTTPParser.parserOnIncomingClient [as onIncoming] (_http_client.js:603:27)
2020-05-16T05:35:05.496Z     at HTTPParser.parserOnHeadersComplete (_http_common.js:119:17)
2020-05-16T05:35:05.497Z     at Socket.socketOnData (_http_client.js:476:22)
2020-05-16T05:35:05.501Z     at Socket.emit (events.js:311:20)
2020-05-16T05:35:05.501Z     at Socket.Readable.read (_stream_readable.js:512:10)
2020-05-16T05:35:05.502Z     at Socket.read (net.js:618:39)
2020-05-16T05:35:05.502Z     at flow (_stream_readable.js:989:34)
2020-05-16T05:35:05.503Z     at resume_ (_stream_readable.js:970:3)
2020-05-16T05:35:05.504Z     at processTicksAndRejections (internal/process/task_queues.js:84:21)
2020-05-16T05:35:08.654Z ERROR: BasicCrawler: handleRequestFunction failed, reclaiming failed request back to the list or queue {"url":"https://www.amazon.com/dp/B07P6Y8L3F","retryCount":3,"id":"CxCejBi58nyfOEt"}
2020-05-16T05:35:08.656Z   Error: Request for https://www.amazon.com/dp/B07P6Y8L3F aborted due to abortFunction
2020-05-16T05:35:08.657Z     at DuplexWrapper.<anonymous> (/usr/src/app/node_modules/@apify/http-request/src/index.js:167:25)
2020-05-16T05:35:08.658Z     at DuplexWrapper.emit (events.js:311:20)
2020-05-16T05:35:08.659Z     at EventEmitter.<anonymous> (/usr/src/app/node_modules/got/source/as-stream.js:60:9)
2020-05-16T05:35:08.660Z     at EventEmitter.emit (events.js:311:20)
2020-05-16T05:35:08.660Z     at module.exports (/usr/src/app/node_modules/got/source/get-response.js:22:10)
2020-05-16T05:35:08.661Z     at ClientRequest.handleResponse (/usr/src/app/node_modules/got/source/request-as-event-emitter.js:155:5)
2020-05-16T05:35:08.662Z     at Object.onceWrapper (events.js:418:26)
2020-05-16T05:35:08.663Z     at ClientRequest.emit (events.js:323:22)
2020-05-16T05:35:08.663Z     at ClientRequest.origin.emit (/usr/src/app/node_modules/@szmarczak/http-timer/source/index.js:37:11)
2020-05-16T05:35:08.664Z     at HTTPParser.parserOnIncomingClient [as onIncoming] (_http_client.js:603:27)
2020-05-16T05:35:08.664Z     at HTTPParser.parserOnHeadersComplete (_http_common.js:119:17)
2020-05-16T05:35:08.666Z     at Socket.socketOnData (_http_client.js:476:22)
2020-05-16T05:35:08.666Z     at Socket.emit (events.js:311:20)
2020-05-16T05:35:08.667Z     at Socket.Readable.read (_stream_readable.js:512:10)
2020-05-16T05:35:08.667Z     at Socket.read (net.js:618:39)
2020-05-16T05:35:08.668Z     at flow (_stream_readable.js:989:34)
2020-05-16T05:35:08.668Z     at resume_ (_stream_readable.js:970:3)
2020-05-16T05:35:08.669Z     at processTicksAndRejections (internal/process/task_queues.js:84:21)
2020-05-16T05:35:11.922Z INFO: Request https://www.amazon.com/dp/B07P6Y8L3F failed 4 times
2020-05-16T05:35:11.924Z ERROR: BasicCrawler: runTaskFunction error handler threw an exception. This places the crawler and its underlying storages into an unknown state and crawling will be terminated. This may have happened due to an internal error of Apify's API or due to a misconfigured crawler. If you are sure that there is no error in your code, selecting "Restart on error" in the actor's settingswill make sure that the run continues where it left off, if programmed to handle restarts correctly.
2020-05-16T05:35:11.924Z   ReferenceError: $ is not defined
2020-05-16T05:35:11.926Z     at BasicCrawler.handleFailedRequestFunction (/usr/src/app/src/main.js:161:46)
2020-05-16T05:35:11.926Z     at BasicCrawler._requestFunctionErrorHandler (/usr/src/app/node_modules/apify/build/crawlers/basic_crawler.js:475:17)
2020-05-16T05:35:11.927Z     at processTicksAndRejections (internal/process/task_queues.js:97:5)
2020-05-16T05:35:11.928Z     at async BasicCrawler._runTaskFunction (/usr/src/app/node_modules/apify/build/crawlers/basic_crawler.js:413:9)
2020-05-16T05:35:11.928Z     at async AutoscaledPool._maybeRunTask (/usr/src/app/node_modules/apify/build/autoscaling/autoscaled_pool.js:463:7)
2020-05-16T05:35:11.929Z ERROR: AutoscaledPool: runTaskFunction failed.
2020-05-16T05:35:11.930Z   ReferenceError: $ is not defined
2020-05-16T05:35:11.931Z     at BasicCrawler.handleFailedRequestFunction (/usr/src/app/src/main.js:161:46)
2020-05-16T05:35:11.931Z     at BasicCrawler._requestFunctionErrorHandler (/usr/src/app/node_modules/apify/build/crawlers/basic_crawler.js:475:17)
2020-05-16T05:35:11.932Z     at processTicksAndRejections (internal/process/task_queues.js:97:5)
2020-05-16T05:35:11.933Z     at async BasicCrawler._runTaskFunction (/usr/src/app/node_modules/apify/build/crawlers/basic_crawler.js:413:9)
2020-05-16T05:35:11.947Z     at async AutoscaledPool._maybeRunTask (/usr/src/app/node_modules/apify/build/autoscaling/autoscaled_pool.js:463:7)
2020-05-16T05:35:11.948Z INFO: Crawler final request statistics: {"avgDurationMillis":null,"perMinute":0,"finished":0,"failed":1,"retryHistogram":[null,null,null,1]}
2020-05-16T05:35:11.949Z ERROR: The function passed to Apify.main() threw an exception:
2020-05-16T05:35:11.950Z   ReferenceError: $ is not defined
2020-05-16T05:35:11.950Z     at BasicCrawler.handleFailedRequestFunction (/usr/src/app/src/main.js:161:46)
2020-05-16T05:35:11.951Z     at BasicCrawler._requestFunctionErrorHandler (/usr/src/app/node_modules/apify/build/crawlers/basic_crawler.js:475:17)
2020-05-16T05:35:11.952Z     at processTicksAndRejections (internal/process/task_queues.js:97:5)
2020-05-16T05:35:11.952Z     at async BasicCrawler._runTaskFunction (/usr/src/app/node_modules/apify/build/crawlers/basic_crawler.js:413:9)
2020-05-16T05:35:11.953Z     at async AutoscaledPool._maybeRunTask (/usr/src/app/node_modules/apify/build/autoscaling/autoscaled_pool.js:463:7)
2020-05-16T05:35:11.953Z npm ERR! code ELIFECYCLE
2020-05-16T05:35:11.954Z npm ERR! errno 91
2020-05-16T05:35:11.955Z npm ERR! amazon@1.0.0 start: `node ./src/main.js`
2020-05-16T05:35:11.955Z npm ERR! Exit status 91
2020-05-16T05:35:11.956Z npm ERR! 
2020-05-16T05:35:11.956Z npm ERR! Failed at the amazon@1.0.0 start script.
2020-05-16T05:35:11.957Z npm ERR! This is probably not a problem with npm. There is likely additional logging output above.
2020-05-16T05:35:11.957Z 
2020-05-16T05:35:11.958Z npm ERR! A complete log of this run can be found in:
2020-05-16T05:35:11.959Z npm ERR!     /root/.npm/_logs/2020-05-16T05_35_11_942Z-debug.log

Could you give us some guidance on this issue — is it something to do with the crawler, or with our Apify subscription?

I figured this has something to do with the proxy settings.