Could not convert string to float: '*****'

Question

Could not convert string to float: '*****'

datarebellion opened this issue 6 months ago · comments

Hi, thanks for this package, it has been very useful! I'm running the query below, and the ACS data it retrieves contains asterisks. I can't convert the asterisks myself because the query fails before it returns a data frame. Any tips on how to resolve this? Thanks!

# Variables to request
acs_variables = [
    'B01001_001E', 'B01001_001EA', 'B01001_001M', 'B01001_001MA',
    'B01001_002E', 'B01001_002EA', 'B01001_002M', 'B01001_002MA',
    'B01001_003E', 'B01001_003EA', 'B01001_003M', 'B01001_003MA',
    'B01001_004E', 'B01001_004EA', 'B01001_004M', 'B01001_004MA',
]

# Set up the query
query = Query(
    estimate=1,
    years=[2021],
    variables=acs_variables,
    for_geo='county:*',
    in_geo=['state:*'],
    # geometry is optional: 'points', 'polygons', or None (the default)
    geometry='points',
    # Supply your actual Census API key here
    census_api_key=key,
)

# Execute the query and keep the resulting dataframe
df = query.run()

Error


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[199], line 17
      4 query = Query(
      5     estimate=1,
      6     years=[2021],
   (...)
     13     census_api_key=key
     14 )
     16 # Run query and collect output in dataframe
---> 17 df = query.run()

File ~/.local/lib/python3.10/site-packages/autocensus/query.py:462, in Query.run(self)
    460         logger.info('Retrieving shapefiles...')
    461         shapefiles.extend(self.get_shapefiles())
--> 462 dataframe = self.assemble_dataframe(variables, tables, gazetteer_files, shapefiles)
    463 return dataframe

File ~/.local/lib/python3.10/site-packages/autocensus/query.py:406, in Query.assemble_dataframe(self, variables, tables, gazetteer_files, shapefiles)
    404 # Merge tables with variables, annotations
    405 logger.info('Merging ACS tables and variables...')
--> 406 tables_dataframe = self.convert_tables_to_dataframe(tables)
    407 variables_dataframe = self.convert_variables_to_dataframe(variables)
    408 dataframe = tables_dataframe.merge(
    409     right=variables_dataframe, how='left', on=['variable', 'year']
    410 )

File ~/.local/lib/python3.10/site-packages/autocensus/query.py:285, in Query.convert_tables_to_dataframe(self, tables)
    279 # Ensure correct sort order and value dtype
    280 dataframe = (
    281     pd.concat(subsets)
    282     .sort_values(by=['geo_type', 'variable', 'NAME', 'year'])
    283     .reset_index(drop=True)
    284 )
--> 285 dataframe['value'] = dataframe['value'].astype(float)
    287 return dataframe

File ~/.local/lib/python3.10/site-packages/pandas/core/generic.py:6534, in NDFrame.astype(self, dtype, copy, errors)
   6530     results = [ser.astype(dtype, copy=copy) for _, ser in self.items()]
   6532 else:
   6533     # else, only a single dtype is given
-> 6534     new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   6535     res = self._constructor_from_mgr(new_data, axes=new_data.axes)
   6536     return res.__finalize__(self, method="astype")

File ~/.local/lib/python3.10/site-packages/pandas/core/internals/managers.py:414, in BaseBlockManager.astype(self, dtype, copy, errors)
    411 elif using_copy_on_write():
    412     copy = False
--> 414 return self.apply(
    415     "astype",
    416     dtype=dtype,
    417     copy=copy,
    418     errors=errors,
    419     using_cow=using_copy_on_write(),
    420 )

File ~/.local/lib/python3.10/site-packages/pandas/core/internals/managers.py:354, in BaseBlockManager.apply(self, f, align_keys, **kwargs)
    352         applied = b.apply(f, **kwargs)
    353     else:
--> 354         applied = getattr(b, f)(**kwargs)
    355     result_blocks = extend_blocks(applied, result_blocks)
    357 out = type(self).from_blocks(result_blocks, self.axes)

File ~/.local/lib/python3.10/site-packages/pandas/core/internals/blocks.py:616, in Block.astype(self, dtype, copy, errors, using_cow)
    596 """
    597 Coerce to the new dtype.
    598 
   (...)
    612 Block
    613 """
    614 values = self.values
--> 616 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
    618 new_values = maybe_coerce_values(new_values)
    620 refs = None

File ~/.local/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:238, in astype_array_safe(values, dtype, copy, errors)
    235     dtype = dtype.numpy_dtype
    237 try:
--> 238     new_values = astype_array(values, dtype, copy=copy)
    239 except (ValueError, TypeError):
    240     # e.g. _astype_nansafe can fail on object-dtype of strings
    241     #  trying to convert to float
    242     if errors == "ignore":

File ~/.local/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:183, in astype_array(values, dtype, copy)
    180     values = values.astype(dtype, copy=copy)
    182 else:
--> 183     values = _astype_nansafe(values, dtype, copy=copy)
    185 # in pandas we don't store numpy str dtypes, so convert to object
    186 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):

File ~/.local/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:134, in _astype_nansafe(arr, dtype, copy, skipna)
    130     raise ValueError(msg)
    132 if copy or arr.dtype == object or dtype == object:
    133     # Explicit copy, or required since NumPy can't view from / to object.
--> 134     return arr.astype(dtype, copy=True)
    136 return arr.astype(dtype, copy=copy)

ValueError: could not convert string to float: '*****'

cmsetzer · Answer 1 · Sat Jan 20 2024 05:45:23 GMT+0800 (China Standard Time)

Hi @datarebellion, thanks for reporting this! Looks like a bug in the way autocensus handles Margin of Error Annotation (MA) values. (This page has more on annotation values like *****, if you're curious).

I'll plan to leave this issue open until I can implement a fix in the next release. In the meantime, your query should work as expected if you exclude MA variables such as B01001_001MA.

datarebellion · Answer 2 · Tue Jan 23 2024 01:45:13 GMT+0800 (China Standard Time)

Hi @cmsetzer, thanks for the quick reply! I need those columns, but will exclude them for now and add them back once the data is pulled. Thanks for all your work on this!

cmsetzer · Answer 3 · Thu Feb 01 2024 09:54:52 GMT+0800 (China Standard Time)

I've released version 2.1.3, which includes a fix for this Margin of Error Annotation bug. There may yet be some edge cases this doesn't cover, but it does resolve the issue with the MA variables from your query above, such as B01001_001MA. Upgrade like so:

pip install --upgrade autocensus

Thanks again for reporting the bug!