Timeout error during execution of mutation when no of async tasks is above 5000 +
cgoma opened this issue · comments
Describe the problem
When number of tasks which executes mutation goes beyond 5000+ , timeout error occurred for some of the tasks.
When the task number is less i.e. around 300, all mutation executed successfully.
For 5000 tasks as well, it succeeded few times but for most of the time it failed.
I want to make sure all the tasks (5000+) should be successful. Not sure why it worked some time and fails most of the time.
The code which I am using is as below. How I can make sure all the 5000 mutation should be executed successfully without fail
#!/usr/bin/python3
# __author__ = "cgoma"
# __version__ = "2024.02.09"
import sys
import os
import time
import json
import logging
import asyncio
import backoff
from gql import gql,Client
from gql.transport.aiohttp import AIOHTTPTransport
from gql.transport.requests import RequestsHTTPTransport
from gql.transport.exceptions import TransportQueryError, \
TransportProtocolError, TransportServerError,TransportClosed, \
TransportAlreadyConnected
from concurrent.futures import ThreadPoolExecutor, wait
from gql.transport.httpx import HTTPXAsyncTransport
module_log = logging.getLogger(__name__)
def get_task(session,service_bus_details):
tasks = []
time_to_read_data = 0
for item in service_bus_details:
channel_list = getChannel()
for signal in channel_list:
start = time.time()
ch = cg.getChannel()
end = time.time()
total_time = end - start
time_to_read_data += total_time
try:
mutation = gql(
"""
mutation CreateProject($objects: signal_insert_input!)
{
insert_signal_one(object: $objects)
{
name
signal
iteration
}
}
"""
)
params = {'objects': {'name':signal+ '.' + str(number),'signal':ch.data.tolist(),'iteration':str(number)}}
tasks.append(asyncio.create_task(session.execute(mutation,variable_values=params,get_execution_result=True)))
except TransportQueryError as E:
module_log.warning(f"Wrong query {E}")
except Exception as err:
module_log.warning(f"The exception {err}:")
return tasks,time_to_read_data
async def main():
module_log.warning('Inside new main function')
try:
# read env variables
file_share_root = os.getenv("FILE_SHARE_ROOT")
#Service bus
#Read a message from service bus
service_bus_details = await asyncio.gather(get_service_bus_details())
module_log.warning(f"Service bus details JSON:\n{json.dumps(service_bus_details, indent=4)}")
tasks = []
results = []
data_service_secret = os.getenv("ADMIN_SECRET")
data_service_endpoint = os.getenv("DATA_SERVICE_ENDPOINT")
#data service endpoint
headers={'x-hasura-admin-secret': data_service_secret}
headers['Content-Type'] = 'application/json'
transport = AIOHTTPTransport(url=data_service_endpoint,headers=headers)
client = Client(transport=transport,fetch_schema_from_transport=True,execute_timeout=None)
retry_connect = backoff.on_exception(
backoff.expo,
Exception,
max_value=10,
jitter=None,
)
# Here Only 3 tries for execute calls
retry_execute = backoff.on_exception(
backoff.expo,
Exception,
max_tries=3,
giveup=lambda e: isinstance(e, TransportQueryError),
)
#here create persistent session also try to execute query when failed due to transport error
session = await client.connect_async(reconnecting=True, retry_connect=retry_connect,retry_execute=retry_execute )
tasks,read_time = get_task(session,service_bus_details[0])
module_log.warning(f"The number of tasks send to event loop are {len(tasks)}")
#creating sublist of 10 tasks and trying to send execute mutation for 10 tasks
n = 10
chunk_list = [tasks[i:i+n] for i in range(0, len(tasks), n)]
start_time = time.time()
for task in chunk_list:
responses = await asyncio.gather(*task,return_exceptions=False)
end_time = time.time()
insert_time = end_time - start_time
await client.close_async()
except TransportQueryError as err:
module_log.warning(f"TransportQueryError:{err}")
except TransportProtocolError as err:
module_log.warning(f"TransportProtocolError: {err}")
except TransportServerError as err:
module_log.warning(f"TransportServerError: {err}")
except TransportClosed as err:
module_log.warning(f"TransportClosed: {err}")
except TransportAlreadyConnected as err:
module_log.warning(f"TransportAlreadyConnected: {err}")
except asyncio.TimeoutError as err:
module_log.warning(f"AsyncTimeoutError: {err}")
except asyncio.CancelledError as err:
module_log.warning(f"ACancelledError: {err}")
except TimeoutError as err:
module_log.warning(f"TimeoutError: {err}")
except Exception as err:
module_log.warning(f"Exception while reading signal name and data:{err}")
asyncio.run(main())
Below are the exceptions I am getting
Traceback (most recent call last):
File "/app/./app.py", line 182, in <module>
asyncio.run(main())
File "/usr/local/lib/python3.11/asyncio/runners.py", line 190, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/asyncio/base_events.py", line 650, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "/app/./app.py", line 155, in main
responses = await asyncio.gather(*task,return_exceptions=False)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/gql/client.py", line 1628, in execute
result = await self._execute(
^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/gql/client.py", line 1816, in _execute
return await self._execute_with_retries(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/backoff/_async.py", line 151, in retry
ret = await target(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/gql/client.py", line 1789, in _execute_once
answer = await super()._execute(
^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/gql/client.py", line 1537, in _execute
result = await self.transport.execute(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/gql/transport/aiohttp.py", line 308, in execute
async with self.session.post(self.url, ssl=self.ssl, **post_args) as resp:
File "/usr/local/lib/python3.11/site-packages/aiohttp/client.py", line 1194, in __aenter__
self._resp = await self._coro
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/aiohttp/client.py", line 504, in _request
with timer:
File "/usr/local/lib/python3.11/site-packages/aiohttp/helpers.py", line 735, in __exit__
raise asyncio.TimeoutError from None
TimeoutError
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fee50215f50>
Task exception was never retrieved
future: <Task finished name='Task-1437' coro=<AsyncClientSession.execute() done, defined at /usr/local/lib/python3.11/site-packages/gql/client.py:1598> exception=TimeoutError()>
You need to understand that when you use asyncio.create_task
, that task is started immediately, and asyncio.gather
is only used to wait for a specific task.
So you started 5000 tasks at the same time and in your for task in chunk_list
loop you're waiting for each of those tasks in sequence, but they all already started.
So you're running into a timeout which is implemented in the aiohttp library.
By default there is a 5 minutes timeout. See https://docs.aiohttp.org/en/stable/client_quickstart.html#timeouts
You could probably modify that timeout by passing a custom aiohttp.ClientTimeout
object to the timeout
key of the client_session_args
parameter of AIOHTTPTransport
, but a better way to fix this would be just to reorganize your code to maybe have a pool of 10 asyncio tasks waiting for new jobs to do from a asyncio queue, or at least create your tasks 10 by 10 instead of creating them all at once.