Reduce number of packages fetched
hugovk opened this issue
Similar to #3, the data dump is using up too much of the free BigQuery quota on each run.
Tail of the logs:
Fri Nov 8 17:30:01 EET 2019
From github.com:hugovk/top-pypi-packages
* branch master -> FETCH_HEAD
Already up-to-date.
Traceback (most recent call last):
  File "/usr/local/bin/pypinfo", line 11, in <module>
    sys.exit(pypinfo())
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 722, in __call__
    return self.main(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 697, in main
    rv = self.invoke(ctx)
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 1043, in invoke
    return Command.invoke(self, ctx)
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 895, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 535, in invoke
    return callback(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/click/decorators.py", line 17, in new_func
    return f(get_current_context(), *args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/pypinfo/cli.py", line 150, in pypinfo
    query_rows = query_job.result(timeout=timeout // 1000)
  File "/usr/local/lib/python3.6/dist-packages/google/cloud/bigquery/job.py", line 1932, in result
    super(QueryJob, self).result(timeout=timeout)
  File "/usr/local/lib/python3.6/dist-packages/google/cloud/bigquery/job.py", line 528, in result
    return super(_AsyncJob, self).result(timeout=timeout)
  File "/usr/local/lib/python3.6/dist-packages/google/api_core/future/polling.py", line 106, in result
    self._blocking_poll(timeout=timeout)
  File "/usr/local/lib/python3.6/dist-packages/google/cloud/bigquery/job.py", line 1906, in _blocking_poll
    super(QueryJob, self)._blocking_poll(timeout=timeout)
  File "/usr/local/lib/python3.6/dist-packages/google/api_core/future/polling.py", line 85, in _blocking_poll
    retry_(self._done_or_raise)()
  File "/usr/local/lib/python3.6/dist-packages/google/api_core/retry.py", line 260, in retry_wrapped_func
    on_error=on_error,
  File "/usr/local/lib/python3.6/dist-packages/google/api_core/retry.py", line 177, in retry_target
    return target()
  File "/usr/local/lib/python3.6/dist-packages/google/api_core/future/polling.py", line 62, in _done_or_raise
    if not self.done():
  File "/usr/local/lib/python3.6/dist-packages/google/cloud/bigquery/job.py", line 1894, in done
    project=self.project, timeout_ms=timeout_ms)
  File "/usr/local/lib/python3.6/dist-packages/google/cloud/bigquery/client.py", line 523, in _get_query_results
    retry, method='GET', path=path, query_params=extra_params)
  File "/usr/local/lib/python3.6/dist-packages/google/cloud/bigquery/client.py", line 275, in _call_api
    return call()
  File "/usr/local/lib/python3.6/dist-packages/google/api_core/retry.py", line 260, in retry_wrapped_func
    on_error=on_error,
  File "/usr/local/lib/python3.6/dist-packages/google/api_core/retry.py", line 177, in retry_target
    return target()
  File "/usr/local/lib/python3.6/dist-packages/google/cloud/_http.py", line 293, in api_request
    raise exceptions.from_http_response(response)
google.api_core.exceptions.Forbidden: 403 GET https://www.googleapis.com/bigquery/v2/projects/top-pypi-packages/queries/cd819ea5-5d5f-429a-9f89-5cf6ed65f288?maxResults=0&timeoutMs=10000: Quota exceeded: Your project exceeded quota for free query bytes scanned. For more information, see https://cloud.google.com/bigquery/troubleshooting-errors
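One way to catch this before a scheduled run is a BigQuery dry run, which reports how many bytes a query would scan without executing it (and without spending quota). Here is a minimal sketch using google-cloud-bigquery; the SQL is only a rough stand-in for what pypinfo generates, and the table name, columns, date range and project are assumptions, not this project's actual query.

```python
# Minimal sketch: estimate how many bytes a query would scan *before* running it,
# using a BigQuery dry run (dry runs are free and don't count against quota).
# The SQL below is only a rough stand-in for the query pypinfo builds; the table
# name, columns and date range are assumptions, not this project's actual query.
from google.cloud import bigquery

client = bigquery.Client(project="top-pypi-packages")

sql = """
SELECT file.project AS project, COUNT(*) AS download_count
FROM `the-psf.pypi.downloads*`
WHERE _TABLE_SUFFIX BETWEEN '20191009' AND '20191107'
GROUP BY project
ORDER BY download_count DESC
LIMIT 5000
"""

# dry_run=True asks BigQuery to validate the query and report the bytes it would
# scan without executing it; use_query_cache=False matches the uncached runs above.
job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
job = client.query(sql, job_config=job_config)

gib = job.total_bytes_processed / 2**30
print(f"Would process {job.total_bytes_processed:,} bytes (~{gib:.1f} GiB)")
```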
Checking the quota used by recent commits (looking only at the 30-day file) shows it increasing quite a lot, most recent first, with an entry from roughly a year earlier for comparison:

| last_update | bytes_billed | bytes_processed | cached | estimated_cost (USD) |
|---|---:|---:|---|---:|
| 2019-10-25 14:30:16 | 64,801,996,800 | 64,801,954,761 | false | 0.30 |
| 2019-10-11 14:30:19 | 63,343,427,584 | 63,342,974,297 | false | 0.29 |
| 2019-09-27 14:30:18 | 60,923,314,176 | 60,922,288,651 | false | 0.28 |
| 2019-09-13 14:30:19 | 57,708,380,160 | 57,707,509,354 | false | 0.27 |
| 2018-12-09 15:29:18 (~1 year back) | 34,852,569,088 | 34,852,500,801 | false | 0.16 |
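For context, these `estimated_cost` figures line up with BigQuery's on-demand price of $5 per TiB scanned (the first 1 TiB per month is free). A quick sanity check; the round-up-to-the-next-cent step is an assumption that happens to reproduce the numbers above:

```python
import math

# Check that estimated_cost follows from bytes_billed at $5 per TiB scanned,
# rounded up to the next cent (the rounding-up is an assumption).
USD_PER_TIB = 5.0

for bytes_billed in (64_801_996_800, 57_708_380_160, 34_852_569_088):
    tib = bytes_billed / 2**40
    cost = math.ceil(tib * USD_PER_TIB * 100) / 100
    print(f"{bytes_billed:,} bytes = {tib:.4f} TiB -> ${cost:.2f}")

# 64,801,996,800 bytes = 0.0589 TiB -> $0.30
# 57,708,380,160 bytes = 0.0525 TiB -> $0.27
# 34,852,569,088 bytes = 0.0317 TiB -> $0.16
```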
The dump has already been reduced from weekly to fortnightly (#5); let's now reduce the number of packages fetched. Let's first try going from 5,000 to 4,000.
The current download counts for packages at these positions:

* Position 1: 799,281,348
* Position 4,000: 150,934
* Position 5,000: 91,375
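For reference, figures like these can be read straight out of a committed dump file. A rough sketch follows; the filename and the `rows`/`download_count` keys are assumptions about the JSON layout rather than a documented interface.

```python
# Rough sketch: print the download count at given positions in a committed dump.
# The filename and the "rows"/"download_count" keys are assumed, not guaranteed.
import json

with open("top-pypi-packages-30-days.json") as f:
    data = json.load(f)

rows = data["rows"]  # assumed to be ordered by download count, descending
for position in (1, 4_000, 5_000):
    print(position, rows[position - 1]["download_count"])
```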