Could not convert string to float: '*****'
datarebellion opened this issue · comments
Hi, thanks for this package, it has been very useful! I'm running the query below, and the ACS data it retrieves contains asterisks. I can't convert the asterisks myself because the query fails before it returns a data frame. Any tips on how to resolve this? Thanks!
# Configure query
query = Query(
estimate=1,
years=[2021],
variables=['B01001_001E', 'B01001_001EA', 'B01001_001M', 'B01001_001MA', 'B01001_002E',
'B01001_002EA', 'B01001_002M', 'B01001_002MA', 'B01001_003E', 'B01001_003EA', 'B01001_003M', 'B01001_003MA',
'B01001_004E', 'B01001_004EA', 'B01001_004M', 'B01001_004MA'],
for_geo='county:*',
in_geo=['state:*'],
# Optional arg to add geometry: 'points', 'polygons', or None (default)
geometry='points',
# Fill in the following with your actual Census API key
census_api_key=key
)
# Run query and collect output in dataframe
df = query.run()
Error
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[199], line 17
4 query = Query(
5 estimate=1,
6 years=[2021],
(...)
13 census_api_key=key
14 )
16 # Run query and collect output in dataframe
---> 17 df = query.run()
File ~/.local/lib/python3.10/site-packages/autocensus/query.py:462, in Query.run(self)
460 logger.info('Retrieving shapefiles...')
461 shapefiles.extend(self.get_shapefiles())
--> 462 dataframe = self.assemble_dataframe(variables, tables, gazetteer_files, shapefiles)
463 return dataframe
File ~/.local/lib/python3.10/site-packages/autocensus/query.py:406, in Query.assemble_dataframe(self, variables, tables, gazetteer_files, shapefiles)
404 # Merge tables with variables, annotations
405 logger.info('Merging ACS tables and variables...')
--> 406 tables_dataframe = self.convert_tables_to_dataframe(tables)
407 variables_dataframe = self.convert_variables_to_dataframe(variables)
408 dataframe = tables_dataframe.merge(
409 right=variables_dataframe, how='left', on=['variable', 'year']
410 )
File ~/.local/lib/python3.10/site-packages/autocensus/query.py:285, in Query.convert_tables_to_dataframe(self, tables)
279 # Ensure correct sort order and value dtype
280 dataframe = (
281 pd.concat(subsets)
282 .sort_values(by=['geo_type', 'variable', 'NAME', 'year'])
283 .reset_index(drop=True)
284 )
--> 285 dataframe['value'] = dataframe['value'].astype(float)
287 return dataframe
File ~/.local/lib/python3.10/site-packages/pandas/core/generic.py:6534, in NDFrame.astype(self, dtype, copy, errors)
6530 results = [ser.astype(dtype, copy=copy) for _, ser in self.items()]
6532 else:
6533 # else, only a single dtype is given
-> 6534 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
6535 res = self._constructor_from_mgr(new_data, axes=new_data.axes)
6536 return res.__finalize__(self, method="astype")
File ~/.local/lib/python3.10/site-packages/pandas/core/internals/managers.py:414, in BaseBlockManager.astype(self, dtype, copy, errors)
411 elif using_copy_on_write():
412 copy = False
--> 414 return self.apply(
415 "astype",
416 dtype=dtype,
417 copy=copy,
418 errors=errors,
419 using_cow=using_copy_on_write(),
420 )
File ~/.local/lib/python3.10/site-packages/pandas/core/internals/managers.py:354, in BaseBlockManager.apply(self, f, align_keys, **kwargs)
352 applied = b.apply(f, **kwargs)
353 else:
--> 354 applied = getattr(b, f)(**kwargs)
355 result_blocks = extend_blocks(applied, result_blocks)
357 out = type(self).from_blocks(result_blocks, self.axes)
File ~/.local/lib/python3.10/site-packages/pandas/core/internals/blocks.py:616, in Block.astype(self, dtype, copy, errors, using_cow)
596 """
597 Coerce to the new dtype.
598
(...)
612 Block
613 """
614 values = self.values
--> 616 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
618 new_values = maybe_coerce_values(new_values)
620 refs = None
File ~/.local/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:238, in astype_array_safe(values, dtype, copy, errors)
235 dtype = dtype.numpy_dtype
237 try:
--> 238 new_values = astype_array(values, dtype, copy=copy)
239 except (ValueError, TypeError):
240 # e.g. _astype_nansafe can fail on object-dtype of strings
241 # trying to convert to float
242 if errors == "ignore":
File ~/.local/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:183, in astype_array(values, dtype, copy)
180 values = values.astype(dtype, copy=copy)
182 else:
--> 183 values = _astype_nansafe(values, dtype, copy=copy)
185 # in pandas we don't store numpy str dtypes, so convert to object
186 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
File ~/.local/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:134, in _astype_nansafe(arr, dtype, copy, skipna)
130 raise ValueError(msg)
132 if copy or arr.dtype == object or dtype == object:
133 # Explicit copy, or required since NumPy can't view from / to object.
--> 134 return arr.astype(dtype, copy=True)
136 return arr.astype(dtype, copy=copy)
ValueError: could not convert string to float: '*****'
Hi @datarebellion, thanks for reporting this! Looks like a bug in the way autocensus handles Margin of Error Annotation (MA) values. (This page has more on annotation values like `*****`, if you're curious.)
I'll plan to leave this issue open until I can implement a fix in the next release. In the meantime, your query should work as expected if you exclude MA variables such as `B01001_001MA`.
Hi @cmsetzer, thanks for the quick reply! I need those columns, but will exclude them for now and add them back once the data is pulled. Thanks for all your work on this!
I've released version 2.1.3, which includes a fix for this Margin of Error Annotation bug. There may yet be some edge cases this doesn't cover, but it does resolve the issue with the MA variables from your query above, such as `B01001_001MA`. Upgrade like so:
pip install --upgrade autocensus
Thanks again for reporting the bug!