viveksck / langchangetrack

Package for Statistically significant linguistic change

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Error on Mac OSX 10.10

lennybronner opened this issue · comments

Before I ran this, I commented out all lines that read:

p.cpu_affinity(list(range(cpu_count())))

I did this because I was running into issues with it, and it is only supported on Linux, FreeBSD and Windows anyway (not on Mac OS X).

I ran ngrams_pipeline.py from the examples directory, as shown in the sample usage.

However, now I am running into the following issue:
After getting through all the chunks, I see this message for every word:
"Failed to interpolate" + word
The code then crashes with the following error, which I am unable to fix.

Config:
Input data frame file name: ./working/timeseries/source.csv
Vocab file ../examples/data/temporal_corpus/common_vocab.txt
Output pvalue file ./output/pvals.csv
Output sample file ./output/samples.csv
Columns to drop 1900
Normalize Time series: True
Threshold 1.75
Dropped column 1900
Columns of the data frame are Index([u'Unnamed: 0', u'word', u'1905', u'1910', u'1915', u'1920', u'1925',
u'1930', u'1935', u'1940', u'1945', u'1950', u'1955', u'1960', u'1965',
u'1970', u'1975', u'1980', u'1985', u'1990', u'1995'],
dtype='object')
Number of words we are analyzing: 3790
Traceback (most recent call last):
File "/usr/local/bin/detect_changepoints_word_ts.py", line 5, in <module>
pkg_resources.run_script('langchangetrack==0.1.0', 'detect_changepoints_word_ts.py')
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/pkg_resources.py", line 492, in run_script
self.require(requires)[0].run_script(script_name, ns)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/pkg_resources.py", line 1350, in run_script
execfile(script_filename, namespace, namespace)
File "/Library/Python/2.7/site-packages/langchangetrack-0.1.0-py2.7.egg/EGG-INFO/scripts/detect_changepoints_word_ts.py", line 224, in <module>
main(args)
File "/Library/Python/2.7/site-packages/langchangetrack-0.1.0-py2.7.egg/EGG-INFO/scripts/detect_changepoints_word_ts.py", line 184, in main
results = parallelize_func(cwords[:], get_pval_word_chunk, chunksz=chunksz, n_jobs=workers, df=norm_df, B=args.B)
File "/Library/Python/2.7/site-packages/changepoint/utils/ts_stats.py", line 28, in parallelize_func
delayed(chunker)(chunk, *args, **kwargs) for chunk in chunks)
File "/Library/Python/2.7/site-packages/joblib/parallel.py", line 812, in __call__
self.retrieve()
File "/Library/Python/2.7/site-packages/joblib/parallel.py", line 762, in retrieve
raise exception
joblib.my_exceptions.JoblibAttributeError: JoblibAttributeError


Multiprocessing exception:
...........................................................................
/usr/local/bin/detect_changepoints_word_ts.py in <module>()
1
2 from argparse import ArgumentParser
3 import logging
4 import pandas as pd
----> 5 import numpy as np
6 import itertools
7 import more_itertools
8 import os
9
10 from functools import partial
11 from changepoint.mean_shift_model import MeanShiftModel

...........................................................................
/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/pkg_resources.py in run_script(self=<pkg_resources.WorkingSet object>, requires='langchangetrack==0.1.0', script_name='detect_changepoints_word_ts.py')
487 """Locate distribution for requires and run script_name script"""
488 ns = sys._getframe(1).f_globals
489 name = ns['__name__']
490 ns.clear()
491 ns['__name__'] = name
--> 492 self.require(requires)[0].run_script(script_name, ns)
self.require = <bound method WorkingSet.require of <pkg_resources.WorkingSet object>>
requires.run_script = undefined
script_name = 'detect_changepoints_word_ts.py'
ns = {'ArgumentParser': <class 'argparse.ArgumentParser'>, 'LOGFORMAT': '%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s', 'MeanShiftModel': <class 'changepoint.mean_shift_model.MeanShiftModel'>, 'TS_OFFSET': 2, 'author': 'Vivek Kulkarni', 'builtins': {'ArithmeticError': <type 'exceptions.ArithmeticError'>, 'AssertionError': <type 'exceptions.AssertionError'>, 'AttributeError': <type 'exceptions.AttributeError'>, 'BaseException': <type 'exceptions.BaseException'>, 'BufferError': <type 'exceptions.BufferError'>, 'BytesWarning': <type 'exceptions.BytesWarning'>, 'DeprecationWarning': <type 'exceptions.DeprecationWarning'>, 'EOFError': <type 'exceptions.EOFError'>, 'Ellipsis': Ellipsis, 'EnvironmentError': <type 'exceptions.EnvironmentError'>, ...}, 'email': 'viveksck@gmail.com', 'file': '/Library/Python/2.7/site-packages/langchangetrac...g/EGG-INFO/scripts/detect_changepoints_word_ts.py', 'name': 'main', 'package': None, ...}
493
494 def __iter__(self):
495 """Yield distributions for non-duplicate projects in the working set
496

...........................................................................
/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/pkg_resources.py in run_script(self=<pkg_resources.PathMetadata instance>, script_name='detect_changepoints_word_ts.py', namespace={'ArgumentParser': <class 'argparse.ArgumentParser'>, 'LOGFORMAT': '%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s', 'MeanShiftModel': <class 'changepoint.mean_shift_model.MeanShiftModel'>, 'TS_OFFSET': 2, 'author': 'Vivek Kulkarni', 'builtins': {'ArithmeticError': <type 'exceptions.ArithmeticError'>, 'AssertionError': <type 'exceptions.AssertionError'>, 'AttributeError': <type 'exceptions.AttributeError'>, 'BaseException': <type 'exceptions.BaseException'>, 'BufferError': <type 'exceptions.BufferError'>, 'BytesWarning': <type 'exceptions.BytesWarning'>, 'DeprecationWarning': <type 'exceptions.DeprecationWarning'>, 'EOFError': <type 'exceptions.EOFError'>, 'Ellipsis': Ellipsis, 'EnvironmentError': <type 'exceptions.EnvironmentError'>, ...}, 'email': 'viveksck@gmail.com', 'file': '/Library/Python/2.7/site-packages/langchangetrac...g/EGG-INFO/scripts/detect_changepoints_word_ts.py', 'name': 'main', 'package': None, ...})
1345 script_text = self.get_metadata(script).replace('\r\n','\n')
1346 script_text = script_text.replace('\r','\n')
1347 script_filename = self._fn(self.egg_info,script)
1348 namespace['file'] = script_filename
1349 if os.path.exists(script_filename):
-> 1350 execfile(script_filename, namespace, namespace)
script_filename = '/Library/Python/2.7/site-packages/langchangetrac...g/EGG-INFO/scripts/detect_changepoints_word_ts.py'
namespace = {'ArgumentParser': <class 'argparse.ArgumentParser'>, 'LOGFORMAT': '%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s', 'MeanShiftModel': <class 'changepoint.mean_shift_model.MeanShiftModel'>, 'TS_OFFSET': 2, 'author': 'Vivek Kulkarni', 'builtins': {'ArithmeticError': <type 'exceptions.ArithmeticError'>, 'AssertionError': <type 'exceptions.AssertionError'>, 'AttributeError': <type 'exceptions.AttributeError'>, 'BaseException': <type 'exceptions.BaseException'>, 'BufferError': <type 'exceptions.BufferError'>, 'BytesWarning': <type 'exceptions.BytesWarning'>, 'DeprecationWarning': <type 'exceptions.DeprecationWarning'>, 'EOFError': <type 'exceptions.EOFError'>, 'Ellipsis': Ellipsis, 'EnvironmentError': <type 'exceptions.EnvironmentError'>, ...}, 'email': 'viveksck@gmail.com', 'file': '/Library/Python/2.7/site-packages/langchangetrac...g/EGG-INFO/scripts/detect_changepoints_word_ts.py', 'name': 'main', 'package': None, ...}
1351 else:
1352 from linecache import cache
1353 cache[script_filename] = (
1354 len(script_text), 0, script_text.split('\n'), script_filename

...........................................................................
/Library/Python/2.7/site-packages/langchangetrack-0.1.0-py2.7.egg/EGG-INFO/scripts/detect_changepoints_word_ts.py in <module>()
219 args = parser.parse_args()
220 if args.log == 'DEBUG':
221 sys.excepthook = debug
222 numeric_level = getattr(logging, args.log.upper(), None)
223 logging.basicConfig(level=numeric_level, format=LOGFORMAT)
--> 224 main(args)
225
226
227
228

...........................................................................
/Library/Python/2.7/site-packages/langchangetrack-0.1.0-py2.7.egg/EGG-INFO/scripts/detect_changepoints_word_ts.py in main(args=Namespace(B=1000, col='1900', dont_normalize=Fal...ta/temporal_corpus/common_vocab.txt', workers=16))
179 print "Columns of the data frame are", norm_df.columns
180 cwords = norm_df.word.values
181 print "Number of words we are analyzing:", len(cwords)
182
183 chunksz = np.ceil(len(cwords) / float(workers))
--> 184 results = parallelize_func(cwords[:], get_pval_word_chunk, chunksz=chunksz, n_jobs=workers, df=norm_df, B=args.B)
results = undefined
cwords = array(['limited', 'pardon', 'child', ..., 'searched', 'sleeves', 'defend'], dtype=object)
chunksz = 237.0
workers = 16
df = Unnamed: 0 word 1900 ...9 0.001638 0.001554

[3790 rows x 22 columns]
norm_df = Unnamed: 0 word 1905 1...2784
3789 -0.177008

[3790 rows x 21 columns]
args.B = 1000
185
186 pvals, num_samples = zip(*results)
187
188 header = ['word'] + list(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1])

...........................................................................
/Library/Python/2.7/site-packages/changepoint/utils/ts_stats.py in parallelize_func(iterable=array(['limited', 'pardon', 'child', ..., 'searched', 'sleeves', 'defend'], dtype=object), func=, chunksz=237.0, n_jobs=16, *args=(), **kwargs={'B': 1000, 'df': Unnamed: 0 word 1905 1...2784
3789 -0.177008

[3790 rows x 21 columns]})
23 def parallelize_func(iterable, func, chunksz=1, n_jobs=16, *args, **kwargs):
24 """ Parallelize a function over each element of an iterable. """
25 chunker = func
26 chunks = more_itertools.chunked(iterable, chunksz)
27 chunks_results = Parallel(n_jobs=n_jobs, verbose=50)(
---> 28 delayed(chunker)(chunk, *args, **kwargs) for chunk in chunks)
args = ()
kwargs = {'B': 1000, 'df': Unnamed: 0 word 1905 1...2784
3789 -0.177008

[3790 rows x 21 columns]}
chunks =
29 results = more_itertools.flatten(chunks_results)
30 return list(results)
31
32 # Code taken from: http://nbviewer.ipython.org/github/pv/SciPy-CookBook/blob/master/ipython/SignalSmooth.ipynb

...........................................................................
/Library/Python/2.7/site-packages/joblib/parallel.py in __call__(self=Parallel(n_jobs=16), iterable=<generator object <genexpr>>)
807 if pre_dispatch == "all" or n_jobs == 1:
808 # The iterable was consumed all at once by the above for loop.
809 # No need to wait for async callbacks to trigger to
810 # consumption.
811 self._iterating = False
--> 812 self.retrieve()
self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=16)>
813 # Make sure that we get a last message telling us we are done
814 elapsed_time = time.time() - self._start_time
815 self._print('Done %3i out of %3i | elapsed: %s finished',
816 (len(self._output), len(self._output),


Sub-process traceback:

AttributeError Wed Dec 30 16:34:06 2015
PID: 8524 Python 2.7.10: /usr/bin/python
...........................................................................
/Library/Python/2.7/site-packages/joblib/parallel.pyc in __call__(self=<joblib.parallel.BatchedCalls object>)
67 def __init__(self, iterator_slice):
68 self.items = list(iterator_slice)
69 self._size = len(self.items)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
75 return self._size
76
76

...........................................................................
/Library/Python/2.7/site-packages/langchangetrack-0.1.0-py2.7.egg/EGG-INFO/scripts/detect_changepoints_word_ts.py in get_pval_word_chunk(chunk=['limited', 'pardon', 'child', 'knelt', 'yellow', 'four', 'hath', 'protest', 'woods', 'sleep', 'hanging', 'appetite', 'oldest', 'saved', 'forget', 'whose', 'contained', 'trousers', 'lord', 'sorry', ...], df= Unnamed: 0 word 1905 1...2784
3789 -0.177008

[3790 rows x 21 columns], B=1000)
77 return L, H
78
79
80 def get_pval_word_chunk(chunk, df, B):
81 """ Get the p-values for each time point for a chunk of words. """
---> 82 results = [get_pval_word(df, w, B) for w in chunk]
results = undefined
df = Unnamed: 0 word 1905 1...2784
3789 -0.177008

[3790 rows x 21 columns]
w = 'limited'
B = 1000
chunk = ['limited', 'pardon', 'child', 'knelt', 'yellow', 'four', 'hath', 'protest', 'woods', 'sleep', 'hanging', 'appetite', 'oldest', 'saved', 'forget', 'whose', 'contained', 'trousers', 'lord', 'sorry', ...]
83 return results
84
85
86 def get_minpval_cp(pvalue_df_row):

...........................................................................
/Library/Python/2.7/site-packages/langchangetrack-0.1.0-py2.7.egg/EGG-INFO/scripts/detect_changepoints_word_ts.py in get_pval_word(df= Unnamed: 0 word 1905 1...2784
3789 -0.177008

[3790 rows x 21 columns], word='limited', B=1000)
66 # time series for that word.
67 ts = df[df.word == word].values[0][TS_OFFSET:]
68 # Create a mean shift model
69 model = MeanShiftModel()
70 # Detect the change points using a mean shift model
---> 71 stats_ts, pvals, nums = model.detect_mean_shift(ts, B=B)
stats_ts = undefined
pvals = undefined
nums = undefined
model.detect_mean_shift = >
ts = array([0.0006972276343629123, -0.336397977492424...8727898, -0.3431935290462993, nan], dtype=object)
B = 1000
72 # Return the word and pvals associated with each time point.
73 L = [word]
74 L.extend(pvals)
75 H = [word]

...........................................................................
/Library/Python/2.7/site-packages/changepoint/mean_shift_model.pyc in detect_mean_shift(self=<changepoint.mean_shift_model.MeanShiftModel object>, ts=array([0.0006972276343629123, -0.336397977492424...8727898, -0.3431935290462993, nan], dtype=object), B=1000)
69 samples to draw.
70 """
71 x = np.arange(0, len(ts))
72 stat_ts_func = self.compute_balance_mean_ts
73 null_ts_func = self.shuffle_timeseries
---> 74 stats_ts, pvals, nums = self.get_ts_stats_significance(x, ts, stat_ts_func, null_ts_func, B=B, permute_fast=True)
75 return stats_ts, pvals, nums
76
77 def test(self):
78 print "Testing a time series with a significant mean shift"

...........................................................................
/Library/Python/2.7/site-packages/changepoint/mean_shift_model.pyc in get_ts_stats_significance(self=<changepoint.mean_shift_model.MeanShiftModel object>, x=array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18]), ts=array([0.0006972276343629123, -0.336397977492424...8727898, -0.3431935290462993, nan], dtype=object), stat_ts_func=>, null_ts_func=<bound method MeanShiftModel.shuffle_timeseries ...ngepoint.mean_shift_model.MeanShiftModel object>>, B=1000, permute_fast=True, label_ts='')
19
20 def get_ts_stats_significance(self, x, ts, stat_ts_func, null_ts_func, B=1000, permute_fast=False, label_ts=''):
21 """ Returns the statistics, pvalues and the actual number of bootstrap
22 samples. """
23 stats_ts, pvals, nums = ts_stats_significance(
---> 24 ts, stat_ts_func, null_ts_func, B=B, permute_fast=permute_fast)
25 return stats_ts, pvals, nums
26
27 def generate_null_timeseries(self, ts, mu, sigma):
28 """ Generate a time series with a given mu and sigma. This serves as the

...........................................................................
/Library/Python/2.7/site-packages/changepoint/utils/ts_stats.pyc in ts_stats_significance(ts=array([0.0006972276343629123, -0.336397977492424...8727898, -0.3431935290462993, nan], dtype=object), ts_stat_func=>, null_ts_func=<bound method MeanShiftModel.shuffle_timeseries ...ngepoint.mean_shift_model.MeanShiftModel object>>, B=1000, permute_fast=True)
53
54 def ts_stats_significance(ts, ts_stat_func, null_ts_func, B=1000, permute_fast=False):
55 """ Compute the statistical significance of a test statistic at each point
56 of the time series.
57 """
---> 58 stats_ts = ts_stat_func(ts)
59 if permute_fast:
60 # Permute it in 1 shot
61 null_ts = map(np.random.permutation, np.array([ts, ] * B))
62 else:

...........................................................................
/Library/Python/2.7/site-packages/changepoint/mean_shift_model.pyc in compute_balance_mean_ts(self=<changepoint.mean_shift_model.MeanShiftModel object>, ts=array([0.0006972276343629123, -0.336397977492424...8727898, -0.3431935290462993, nan], dtype=object))
39 """ For changed words we expect an increase in the mean, and so only 1 """
40 return np.mean(ts[t + 1:]) - np.mean(ts[:t + 1])
41
42 def compute_balance_mean_ts(self, ts):
43 """ Compute the balance at each time 't' of the time series."""
---> 44 balance = [self.compute_balance_mean(ts, t) for t in np.arange(0, len(ts) - 1)]
45 return balance
46
47 def compute_balance_median(self, ts, t):
48 """ Compute the balance at either end."""

...........................................................................
/Library/Python/2.7/site-packages/changepoint/mean_shift_model.pyc in compute_balance_mean(self=<changepoint.mean_shift_model.MeanShiftModel object>, ts=array([0.0006972276343629123, -0.336397977492424...8727898, -0.3431935290462993, nan], dtype=object), t=0)
35 return np.random.permutation(ts)
36
37 def compute_balance_mean(self, ts, t):
38 """ Compute the balance. The right end - the left end."""
39 """ For changed words we expect an increase in the mean, and so only 1 """
---> 40 return np.mean(ts[t + 1:]) - np.mean(ts[:t + 1])
t = 0
41
42 def compute_balance_mean_ts(self, ts):
43 """ Compute the balance at each time 't' of the time series."""
44 balance = [self.compute_balance_mean(ts, t) for t in np.arange(0, len(ts) - 1)]

...........................................................................
/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/fromnumeric.pyc in mean(a=array([-0.3363979774924245, -0.1555133537815695,...8727898, -0.3431935290462993, nan], dtype=object), axis=None, dtype=None, out=None, keepdims=False)
2711 return mean(axis=axis, dtype=dtype, out=out)
2712 except AttributeError:
2713 pass
2714
2715 return _methods._mean(a, axis=axis, dtype=dtype,
-> 2716 out=out, keepdims=keepdims)
2717
2718 def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
2719 """
2720 Compute the standard deviation along the specified axis.

...........................................................................
/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/_methods.pyc in _mean(a=array([-0.3363979774924245, -0.1555133537815695,...8727898, -0.3431935290462993, nan], dtype=object), axis=None, dtype=None, out=None, keepdims=False)
62 ret = um.add.reduce(arr, axis=axis, dtype=dtype, out=out, keepdims=keepdims)
63 if isinstance(ret, mu.ndarray):
64 ret = um.true_divide(
65 ret, rcount, out=ret, casting='unsafe', subok=False)
66 else:
---> 67 ret = ret.dtype.type(ret / rcount)
68
69 return ret
70
71 def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):

AttributeError: 'float' object has no attribute 'dtype'


Traceback (most recent call last):
File "ngrams_pipeline.py", line 55, in <module>
main(args)
File "ngrams_pipeline.py", line 28, in main
subprocess.check_call(cmd, shell=True)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/subprocess.py", line 540, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command 'detect_cp_distributional.sh ./working/models ./working ./output 1900 2000 5 locallinear 1000 ../examples/data/temporal_corpus/common_vocab.txt 1000 1.75 16' returned non-zero exit status 1

Any idea what that might be? Thanks a lot!

I managed to fix this bug by upgrading to NumPy 1.10, which has a built-in check that avoids the dtype issue above.

Thanks. I am closing the issue since this is fixed by NumPy 1.10.