[BUG]: `pairwise_linestring_intersection` returns invalid offsets buffer
thomcom opened this issue · comments
Version
23.04
On which installation method(s) does this occur?
Rapids-Compose
Describe the issue
While building benchmarks for binary predicates, I discovered an issue with `pairwise_linestring_intersection` that is hard to reproduce. The `types_buffer` and `offset_buffer` returned by `c_pairwise_linestring_intersection` don't quite match up, resulting in a corrupt `GeoSeries` for the second value returned by `pairwise_linestring_intersection`.
Note that the following code requires `intersection.py` to be modified with this patch in order to support sparse `GeoSeries`:
geoms, look_back_ids = c_pairwise_linestring_intersection(
- linestrings1._column.lines._column, linestrings2._column.lines._column
+ linestrings1.lines.column(), linestrings2.lines.column()
)
Minimum reproducible example
import cupy as cp
import cuspatial
from cuspatial.tests.binpreds.binpred_test_dispatch import (
features,
linestring_linestring_dispatch_list,
)
from cuspatial.core.binops.intersection import pairwise_linestring_intersection
cp.random.seed(0)
def sample_test_data(features, dispatch_list, size):
    """Build two randomly sampled `GeoSeries` from the given dispatch list.

    Each key in `dispatch_list` selects an entry of `features`; elements 1
    and 2 of that entry are taken as the lhs and rhs geometry. The lhs and
    rhs geometries are collected into one `GeoSeries` each, and `size`
    random row positions are then drawn for each series independently.

    Returns a `(lhs, rhs)` tuple of the sampled `GeoSeries`, each with its
    index reset.
    """
    # The [1:3] slice keeps the (lhs, rhs) geometry pair of each feature.
    pairs = [features[name][1:3] for name in dispatch_list]
    lhs_pool = cuspatial.GeoSeries([left for left, _ in pairs])
    rhs_pool = cuspatial.GeoSeries([right for _, right in pairs])
    # Draw the lhs positions before the rhs positions so that results under
    # a fixed random seed stay the same.
    lhs_idx = cp.random.randint(0, len(lhs_pool), size)
    rhs_idx = cp.random.randint(0, len(rhs_pool), size)
    sampled_lhs = lhs_pool[lhs_idx].reset_index(drop=True)
    sampled_rhs = rhs_pool[rhs_idx].reset_index(drop=True)
    return sampled_lhs, sampled_rhs
# Sample 311 random lhs/rhs rows from the linestring-linestring dispatch list.
lhs, rhs = sample_test_data(features, linestring_linestring_dispatch_list, 311)
# Only `geoms`, the second returned value, is inspected below.
offset, geoms, ids = pairwise_linestring_intersection(
lhs, rhs
)
# Dump the input_types and union_offsets columns of the result for comparison.
print(geoms._column._meta.input_types._column)
print(geoms._column._meta.union_offsets._column)
# Printing the series itself is what raises the ArrowIndexError shown in the
# log output.
print(geoms)
Relevant log output
<cudf.core.column.numerical.NumericalColumn object at 0x7f94108c52c0>
[
2,
0,
0,
0,
0,
0,
0,
0,
0,
0,
...
0,
0,
0,
0,
2,
0,
2,
0,
0,
0
]
dtype: int8
<cudf.core.column.numerical.NumericalColumn object at 0x7f94108c5fc0>
[
0,
0,
1,
2,
3,
4,
5,
6,
7,
8,
...
192,
193,
194,
195,
55,
196,
199,
197,
198,
199
]
dtype: int32
---------------------------------------------------------------------------
ArrowIndexError Traceback (most recent call last)
Cell In[2], line 33
30 print(geoms._column._meta.input_types._column)
31 print(geoms._column._meta.union_offsets._column)
---> 33 print(geoms)
File ~/.conda/envs/rapids/lib/python3.10/site-packages/cudf/core/frame.py:2598, in Frame.__str__(self)
2597 def __str__(self):
-> 2598 return self.to_string()
File ~/.conda/envs/rapids/lib/python3.10/site-packages/nvtx/nvtx.py:101, in annotate.__call__.<locals>.inner(*args, **kwargs)
98 @wraps(func)
99 def inner(*args, **kwargs):
100 libnvtx_push_range(self.attributes, self.domain.handle)
--> 101 result = func(*args, **kwargs)
102 libnvtx_pop_range(self.domain.handle)
103 return result
File ~/.conda/envs/rapids/lib/python3.10/site-packages/cudf/core/frame.py:2595, in Frame.to_string(self)
2574 @_cudf_nvtx_annotate
2575 def to_string(self):
2576 r"""
2577 Convert to string
2578
(...)
2593 ' key val\n0 0 10.0\n1 1 11.0\n2 2 12.0'
2594 """
-> 2595 return repr(self)
File ~/cuspatial/python/cuspatial/cuspatial/core/geoseries.py:357, in GeoSeries.__repr__(self)
355 def __repr__(self):
356 # TODO: Implement Iloc with slices so that we can use `Series.__repr__`
--> 357 return self.to_pandas().__repr__()
File ~/cuspatial/python/cuspatial/cuspatial/core/geoseries.py:468, in GeoSeries.to_pandas(self)
464 def to_pandas(self):
465 """Treats to_pandas and to_geopandas as the same call, which improves
466 compatibility with pandas.
467 """
--> 468 return self.to_geopandas()
File ~/cuspatial/python/cuspatial/cuspatial/core/geoseries.py:459, in GeoSeries.to_geopandas(self, nullable)
456 raise ValueError("GeoSeries doesn't support <NA> yet")
457 final_union_slice = self.iloc[0 : len(self._column)]
458 return gpGeoSeries(
--> 459 final_union_slice.to_shapely(),
460 index=self.index.to_pandas(),
461 name=self.name,
462 )
File ~/cuspatial/python/cuspatial/cuspatial/core/geoseries.py:525, in GeoSeries.to_shapely(self)
520 results = []
521 for (result_index, shapely_serialization_fn) in zip(
522 range(0, len(self)), shapely_fns
523 ):
524 results.append(
--> 525 shapely_serialization_fn(union[result_index].as_py())
526 )
528 # Finally, a slice determines that we return a list, otherwise
529 # an object.
530 if len(results) == 1:
File ~/.conda/envs/rapids/lib/python3.10/site-packages/pyarrow/array.pxi:1295, in pyarrow.lib.Array.__getitem__()
File ~/.conda/envs/rapids/lib/python3.10/site-packages/pyarrow/array.pxi:1298, in pyarrow.lib.Array.getitem()
File ~/.conda/envs/rapids/lib/python3.10/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File ~/.conda/envs/rapids/lib/python3.10/site-packages/pyarrow/error.pxi:127, in pyarrow.lib.check_status()
ArrowIndexError: index with value of 199 is out-of-bounds for array of length 57
Environment details
No response
Other/Misc.
Note in particular the last few values of the `input_types` column:
0,
2,
0,
2,
0,
0,
0
]
compared with the corresponding values from `union_offsets`:
195,
55,
196,
199,
197,
198,
199
]
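It is clear that the `union_offsets` value for the 2nd linestring shown here (the second entry whose type is `2`) is incorrect: it should be 56, not 199. The `ArrowIndexError` above reports a child array of length 57, so 56 is the last valid offset and 199 is out of bounds.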