Better way of finding successful dag duration
ayush-san opened this issue · comments
In the last query of get_dag_duration_info(), why are we filtering on TaskInstance? This causes the final result to explode with duplicate rows.
dag_duration = get_dag_duration_info()
len(dag_duration) # 960640
set_dag_duration = set()
for dag in dag_duration:
set_dag_duration.add((dag.dag_id, dag.start_date, dag.end_date))
len(set_dag_duration) # 38
Instead, we should add the filter condition to the dag_start_dt_query subquery and remove it from the last query:
dag_start_dt_query = (
session.query(
max_execution_dt_query.c.dag_id,
max_execution_dt_query.c.max_execution_dt.label(
"execution_date"
),
func.min(TaskInstance.start_date).label("start_date"),
)
.join(
TaskInstance,
and_(
TaskInstance.dag_id == max_execution_dt_query.c.dag_id,
(
TaskInstance.execution_date
== max_execution_dt_query.c.max_execution_dt
),
),
)
.filter(
TaskInstance.start_date.isnot(None),
TaskInstance.end_date.isnot(None),
)
.group_by(
max_execution_dt_query.c.dag_id,
max_execution_dt_query.c.max_execution_dt,
)
.subquery()
)
return (
session.query(
dag_start_dt_query.c.dag_id,
dag_start_dt_query.c.start_date,
DagRun.end_date,
)
.join(
DagRun,
and_(
DagRun.dag_id == dag_start_dt_query.c.dag_id,
DagRun.execution_date
== dag_start_dt_query.c.execution_date,
),
)
.all()
)
This fix is a must if you'd like to use this Airflow plugin; otherwise, you can fork the plugin and apply the fix to your own copy.