Ch. 7 cell 57 data does not read
vista888 opened this issue · comments
vista888 commented
When I run `movies = pd.read_table('datasets/movielens/movies.dat', sep='::', header=None, names=mnames)`,
I get the following error:
UnicodeDecodeError Traceback (most recent call last)
/var/folders/31/mhwrpttd4_n27dn_wmgr4vqh0000gn/T/ipykernel_12218/3495340260.py in <module>
1 mnames = ['movie_id', 'title', 'genres']
----> 2 movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
3 header=None, names=mnames)
4 movies[:10]
~/.pyenv/versions/3.8.1/envs/tutorials/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~/.pyenv/versions/3.8.1/envs/tutorials/lib/python3.8/site-packages/pandas/io/parsers/readers.py in read_table(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, encoding_errors, delim_whitespace, low_memory, memory_map, float_precision)
681 kwds.update(kwds_defaults)
682
--> 683 return _read(filepath_or_buffer, kwds)
684
685
~/.pyenv/versions/3.8.1/envs/tutorials/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
480
481 # Create the parser.
--> 482 parser = TextFileReader(filepath_or_buffer, **kwds)
483
484 if chunksize or iterator:
~/.pyenv/versions/3.8.1/envs/tutorials/lib/python3.8/site-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds)
809 self.options["has_index_names"] = kwds["has_index_names"]
810
--> 811 self._engine = self._make_engine(self.engine)
812
813 def close(self):
~/.pyenv/versions/3.8.1/envs/tutorials/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _make_engine(self, engine)
1038 )
1039 # error: Too many arguments for "ParserBase"
-> 1040 return mapping[engine](self.f, **self.options) # type: ignore[call-arg]
1041
1042 def _failover_to_python(self):
~/.pyenv/versions/3.8.1/envs/tutorials/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py in __init__(self, f, **kwds)
110 self.num_original_columns,
111 self.unnamed_cols,
--> 112 ) = self._infer_columns()
113 except (TypeError, ValueError):
114 self.close()
~/.pyenv/versions/3.8.1/envs/tutorials/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py in _infer_columns(self)
490 else:
491 try:
--> 492 line = self._buffered_line()
493
494 except StopIteration as err:
~/.pyenv/versions/3.8.1/envs/tutorials/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py in _buffered_line(self)
582 return self.buf[0]
583 else:
--> 584 return self._next_line()
585
586 def _check_for_bom(self, first_row):
~/.pyenv/versions/3.8.1/envs/tutorials/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py in _next_line(self)
679
680 while True:
--> 681 orig_line = self._next_iter_line(row_num=self.pos + 1)
682 self.pos += 1
683
~/.pyenv/versions/3.8.1/envs/tutorials/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py in _next_iter_line(self, row_num)
740 # assert for mypy, data is Iterator[str] or None, would error in next
741 assert self.data is not None
--> 742 return next(self.data)
743 except csv.Error as e:
744 if (
~/.pyenv/versions/3.8.1/envs/tutorials/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py in _read()
226
227 def _read():
--> 228 line = f.readline()
229 pat = re.compile(sep)
230
~/.pyenv/versions/3.8.1/lib/python3.8/codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 3114: invalid continuation byte
Wes McKinney commented
Try adding `encoding='latin-1'` to the `pd.read_table` call.
I will update the file in this repository with a UTF-8-encoded version of the data.
vista888 commented
Thanks! That worked for me.
Wes McKinney commented
Fixed in e9f3eab