Custom aggregate function results in attribute error.
ashutosh-dwivedi-e3502 opened this issue · comments
Writing a custom aggregate function (`agg_fn`) for a test results in an AttributeError.
The following example demonstrates this:
import checklist
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from checklist.pred_wrapper import PredictorWrapper
from checklist.expect import Expect
from checklist.test_types import INV
from checklist.perturb import Perturb
# Seed sentences that the typo perturbation is applied to.
dataset = [
    'I am checking the checklist',
    'There is a bug in the code',
]
class Model(object):
    """Sentence-similarity model built on a pretrained transformer checkpoint.

    Sentences are embedded by mean-pooling their token embeddings and are
    compared with cosine similarity.
    """

    # Similarity cut-off read by the expectation functions via Model.THRESHOLD.
    THRESHOLD = 0.9
    # Checkpoint used for both the tokenizer and the encoder.
    MODEL_NAME = "sentence-transformers/bert-base-nli-mean-tokens"

    def __init__(self):
        """Load the tokenizer and the encoder for MODEL_NAME."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.model = AutoModel.from_pretrained(self.MODEL_NAME)

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, counting only positions where the mask is set.

        Args:
            model_output: Sequence whose first element holds all token embeddings.
            attention_mask: Mask tensor; it is expanded along a new last axis
                to the size of the token embeddings.

        Returns:
            The masked sum of the embeddings over dimension 1 divided by the
            masked token count.
        """
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so that a fully masked row does not divide by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def get_encoding(self, sentences):
        """Tokenize `sentences` (a string or a list of strings) and return pooled embeddings."""
        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
        # Inference only, so gradients are not tracked.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        return self._mean_pooling(model_output, encoded_input['attention_mask'])

    def get_similarities(self, sentence1, other_sentences):
        """Return the cosine similarity between `sentence1` and each of `other_sentences`.

        `other_sentences` is iterated element by element, so a bare string
        would be split into characters; pass a list even for one sentence.
        Size-1 dimensions of the result are removed with np.squeeze.
        """
        e1 = self.get_encoding(str(sentence1))
        e2 = self.get_encoding([str(x) for x in other_sentences])
        return np.squeeze(cosine_similarity(e1, e2))
def similarity_score(inputs):
    """Score each (sentence, perturbed_sentences) input with the global `model`.

    Args:
        inputs: Iterable of (sentence1, other_sentences) pairs.

    Returns:
        A numpy array with one entry per input, each being the result of
        model.get_similarities(sentence1, other_sentences).
    """
    return np.array([
        model.get_similarities(sentence1, other_sentences)
        for sentence1, other_sentences in inputs
    ])
def all_similar(x, pred, conf, label=None, meta=None):
    """Return True only if no value in `pred` is below Model.THRESHOLD.

    If any of the results is below the threshold, the test case doesn't pass.
    Only `pred` is inspected; the other arguments are accepted to match the
    expectation-function signature.
    """
    ret = np.sum(pred < Model.THRESHOLD) == 0
    print(f'pred = {pred}, ret = {ret}')
    return ret
def add_typos(sentence, n=5):
    """Return `sentence` together with `n` typo perturbations of it.

    Each entry of the returned list is the raw return value of
    Perturb.perturb (a wrapper object holding `data`), not a plain string.
    """
    typos = [
        Perturb.perturb([sentence], Perturb.add_typos, keep_original=False)
        for _ in range(n)
    ]
    return sentence, typos
# Adapt the similarity function to the predict-and-confidence interface passed to test.run.
wrapped_pp = PredictorWrapper.wrap_predict(similarity_score)
# Turn the per-example check into an expectation function.
expect_all_similar = Expect.single(all_similar)
# Global model instance read by similarity_score.
model = Model()
# Generate the perturbed test data from the seed sentences.
t = Perturb.perturb(dataset, add_typos, nsamples=200, keep_original=False)
# NOTE: passing the Expect.single wrapper as agg_fn is what triggers the
# AttributeError in the traceback: agg_fn is called with a numpy array.
test = INV(**t, name='add typos', capability='typo',
description='', expect=expect_all_similar, agg_fn=expect_all_similar)
test.run(predict_and_confidence_fn=wrapped_pp, overwrite=True, verbose=True)
test.summary()
This results in the following exception :
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-38-0b1b8f7c7467> in <module>
1 test = INV(**t, name='add typos', capability='typo',
2 description='', expect=expect_all_similar, agg_fn=expect_all_similar)
----> 3 test.run(predict_and_confidence_fn=wrapped_pp, overwrite=True, verbose=True)
4 test.summary()
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in run(self, predict_and_confidence_fn, overwrite, verbose, n, seed)
351 print('Predicting %d examples' % len(examples))
352 preds, confs = predict_and_confidence_fn(examples)
--> 353 self.run_from_preds_confs(preds, confs, overwrite=overwrite)
354
355 def fail_idxs(self):
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in run_from_preds_confs(self, preds, confs, overwrite)
291 self._check_create_results(overwrite)
292 self.update_results_from_preds(preds, confs)
--> 293 self.update_expect()
294
295 def run_from_file(self, path, file_format=None, format_fn=None, ignore_header=False, overwrite=False):
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in update_expect(self)
128 self._check_results()
129 self.results.expect_results = self.expect(self)
--> 130 self.results.passed = Expect.aggregate(self.results.expect_results, self.agg_fn)
131
132 def example_list_and_indices(self, n=None, seed=None):
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in aggregate(data, agg_fn)
145 # data is a list of lists or list of np.arrays
146 # import pdb; pdb.set_trace()
--> 147 return np.array([Expect.aggregate_testcase(x, agg_fn) for x in data])
148
149 @staticmethod
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in <listcomp>(.0)
145 # data is a list of lists or list of np.arrays
146 # import pdb; pdb.set_trace()
--> 147 return np.array([Expect.aggregate_testcase(x, agg_fn) for x in data])
148
149 @staticmethod
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in aggregate_testcase(expect_results, agg_fn)
160 return None
161 else:
--> 162 return agg_fn(np.array(r))
163
164 @staticmethod
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in expect(self)
75 """
76 def expect(self):
---> 77 zipped = iter_with_optional(self.data, self.results.preds, self.results.confs, self.labels, self.meta, self.run_idxs)
78 return [fn(x, pred, confs, labels, meta) for x, pred, confs, labels, meta in zipped]
79 return expect
AttributeError: 'numpy.ndarray' object has no attribute 'results'
This happens because the line `return agg_fn(np.array(r))` in `aggregate_testcase` passes a numpy array to the aggregate function, whereas the signature of the function supplied here is `all_similar(x, pred, conf, label=None, meta=None)`.
Hi!
I didn't quite understand why you would want agg_fn=expect_all_similar
-- In your case, I believe:
- You only need to customize the expectation function on each test case, which is what you are doing with
expect=expect_all_similar
- For `agg_fn`, the aggregation doesn't matter in your case and you should just leave it as the default, because each of your test cases has only one example, of the form `(original, perturbed[])`. This function is also not meant for setting expectations on each example, but for aggregating CheckList-formatted results over multiple examples; it therefore takes a different input format than `expect=` (i.e., an `expect=`-style function cannot recognize the `numpy.ndarray` it receives). For more on `agg_fn`, please see this tutorial and this doc.
I'd change two things in your code:
- The `add_typos` function -- The CheckList `Perturb.perturb` wrapper returns a munch (a fancy dict), not raw strings. To get the strings, you will want to do:
def add_typos(sentence, n=5):
    """Return `sentence` together with a list of `n` typo'd copies of it (plain strings)."""
    # Calling Perturb.add_typos directly gives the perturbed string; this is equivalent to:
    # Perturb.perturb([sentence], Perturb.add_typos, keep_original=False).data[0][0]
    typos = [Perturb.add_typos(sentence) for _ in range(n)]
    return sentence, typos
The contents of `t.data` before and after the change:
# before
[[('I am checking the checklist',
[MunchWithAdd({'data': [['I am checking th echecklist']]}),
MunchWithAdd({'data': [['I am chekcing the checklist']]}),
MunchWithAdd({'data': [['I am cehcking the checklist']]}),
MunchWithAdd({'data': [['I am chekcing the checklist']]}),
MunchWithAdd({'data': [['I am checknig the checklist']]})])],
[('There is a bug in the code',
[MunchWithAdd({'data': [['Theer is a bug in the code']]}),
MunchWithAdd({'data': [['There is a bug in the cdoe']]}),
MunchWithAdd({'data': [['Ther eis a bug in the code']]}),
MunchWithAdd({'data': [['There is a bgu in the code']]}),
MunchWithAdd({'data': [['There i sa bug in the code']]})])]]
# after
[[('I am checking the checklist',
['I a mchecking the checklist',
'I am checkign the checklist',
'I am checkingt he checklist',
'I am checking the checklits',
'I am checking the checklits'])],
[('There is a bug in the code',
['There si a bug in the code',
'There is a bug i nthe code',
'hTere is a bug in the code',
'There is a bug in teh code',
'There is ab ug in the code'])]]
- Remove the unnecessary (and incorrectly defined)
agg_fn
:
test = INV(**t, name='add typos', capability='typo', description='', expect=expect_all_similar)
Side note: If you want your test cases to contain multiple examples, with each example being a string tuple (original, perturbed)
, you would want to do the following:
# get tupled data from add_typos
def add_typos(sentence, n=5):
    """Return `n` (original, perturbed) string pairs for `sentence`.

    Each pair is one example, so a test case built from this list contains
    `n` examples.
    """
    # Pair the original with each typo'd string instead of collecting the
    # return value of Perturb.perturb([sentence], Perturb.add_typos, keep_original=False).
    return [(sentence, Perturb.add_typos(sentence)) for _ in range(n)]
t = Perturb.perturb(dataset, add_typos, nsamples=200, keep_original=False)
t.data
"""
# would be:
[[('There is a bug in the code', 'There is a bgu in the code'),
('There is a bug in the code', 'There is a bug int he code'),
('There is a bug in the code', 'There is a bug in th ecode'),
('There is a bug in the code', 'There is a bug in th ecode'),
('There is a bug in the code', 'There is a bug in the coed')],
[('I am checking the checklist', 'I am cehcking the checklist'),
('I am checking the checklist', ' Iam checking the checklist'),
('I am checking the checklist', 'I am checking teh checklist'),
('I am checking the checklist', 'I am checking th echecklist'),
('I am checking the checklist', 'I a mchecking the checklist')]]
"""
# similarity function should be between two sentences
def similarity_score(inputs):
    """Return one similarity score per (original, perturbed) pair in `inputs`."""
    # The perturbed sentence is wrapped in a list so that it is scored as a
    # single sentence; get_similarities iterates over its second argument.
    return np.array([model.get_similarities(pair[0], [pair[1]]) for pair in inputs])
wrapped_pp = PredictorWrapper.wrap_predict(similarity_score)
# expect=a function that says the similarity between a sentence pair needs to be > threshold
def all_similar(x, pred, conf, label=None, meta=None):
    """Return 1 if the similarity `pred` is above Model.THRESHOLD, else 0.

    `pred` must be a single value: int() raises TypeError when the
    comparison yields an array with more than one element.
    """
    return int(pred > Model.THRESHOLD)
expect_all_similar = Expect.single(all_similar)
# `agg_fn='all'` (the default) says we expect ALL the examples in a test case to pass, i.e., all the pairs have similarity > threshold.
test = INV(**t, name='add typos', capability='typo',
description='', expect=expect_all_similar, agg_fn="all")
test.run(predict_and_confidence_fn=wrapped_pp, overwrite=True, verbose=True)
test.summary()
I hope this makes sense, and please feel free to reopen if this doesn't answer your question :)
@tongshuangwu Thanks! Especially for the side note. That does make sense.
There's one small change, which I'll note here for anyone looking at this in the future.
The `all_similar` function should look at the predictions instead, because the `conf` returned by `wrap_predict` is always 1:
def all_similar(x, pred, conf, label=None, meta=None):
    """Return how many values in `pred` are above Model.THRESHOLD.

    Only `pred` is used; `conf` is ignored.
    """
    return np.sum(pred > Model.THRESHOLD)
Ah yes, sorry :) Fixed!
Also, the `test.summary` function in the pairwise example always prints the example fails as `None`. Should I create a separate issue for that?
Predicting 10 examples
Test cases: 2
Fails (rate): 2 (100.0%)
Example fails:
None
None
None
----
None
None
None
----
If I use the all_similar function as :
def all_similar(x, pred, conf, label=None, meta=None):
    """Return 1 if the similarity `pred` is above Model.THRESHOLD, else 0.

    `pred` must be a single value: int() raises TypeError when the
    comparison yields an array with more than one element.
    """
    return int(pred > Model.THRESHOLD)
It raises the following exception :
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-108-c2448cc7e16d> in <module>
2 description='', expect=expect_all_similar, agg_fn="all")
3
----> 4 test.run(predict_and_confidence_fn=wrapped_pp, overwrite=True, verbose=True)
5 test.summary()
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in run(self, predict_and_confidence_fn, overwrite, verbose, n, seed)
351 print('Predicting %d examples' % len(examples))
352 preds, confs = predict_and_confidence_fn(examples)
--> 353 self.run_from_preds_confs(preds, confs, overwrite=overwrite)
354
355 def fail_idxs(self):
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in run_from_preds_confs(self, preds, confs, overwrite)
291 self._check_create_results(overwrite)
292 self.update_results_from_preds(preds, confs)
--> 293 self.update_expect()
294
295 def run_from_file(self, path, file_format=None, format_fn=None, ignore_header=False, overwrite=False):
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in update_expect(self)
127 def update_expect(self):
128 self._check_results()
--> 129 self.results.expect_results = self.expect(self)
130 self.results.passed = Expect.aggregate(self.results.expect_results, self.agg_fn)
131
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in expect(self)
76 def expect(self):
77 zipped = iter_with_optional(self.data, self.results.preds, self.results.confs, self.labels, self.meta, self.run_idxs)
---> 78 return [fn(x, pred, confs, labels, meta) for x, pred, confs, labels, meta in zipped]
79 return expect
80
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in <listcomp>(.0)
76 def expect(self):
77 zipped = iter_with_optional(self.data, self.results.preds, self.results.confs, self.labels, self.meta, self.run_idxs)
---> 78 return [fn(x, pred, confs, labels, meta) for x, pred, confs, labels, meta in zipped]
79 return expect
80
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in expect_fn(xs, preds, confs, label, meta)
96 """
97 def expect_fn(xs, preds, confs, label=None, meta=None):
---> 98 return np.array([fn(x, p, c, l, m) for x, p, c, l, m in iter_with_optional(xs, preds, confs, label, meta)])
99 return Expect.testcase(expect_fn)#, agg_fn)
100
~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in <listcomp>(.0)
96 """
97 def expect_fn(xs, preds, confs, label=None, meta=None):
---> 98 return np.array([fn(x, p, c, l, m) for x, p, c, l, m in iter_with_optional(xs, preds, confs, label, meta)])
99 return Expect.testcase(expect_fn)#, agg_fn)
100
<ipython-input-104-935c7fc5af22> in all_similar(x, pred, conf, label, meta)
1 def all_similar(x, pred, conf, label=None, meta=None):
----> 2 return int(pred > Model.THRESHOLD)
3
4 expect_all_similar = Expect.single(all_similar)
TypeError: only size-1 arrays can be converted to Python scalars
No need to open a new issue, we can just discuss here.
test.summary
function in the pairwise example always return the Example fails as None. Should I create a separate issue for that?
This is because your prediction function is a bit unconventional, so the default formatter does not know how to display its output.
To print the results as you need, you can overwrite the formatter like this:
def format_fn(x, pred, conf, *args, **kwargs):
    """Format one example as '<pred to one decimal> <x>'.

    `conf` and any extra arguments are accepted but ignored.
    """
    return '%.1f %s' % (pred, str(x))
test.summary(format_example_fn=format_fn)
Which will give you something like:
Example fails:
0.8 ('There is a bug in the code', 'There is a bug in the ocde')
0.8 ('There is a bug in the code', 'There is a bug in the cdoe')
0.7 ('There is a bug in the code', 'There is a bug in th ecode')
----
0.8 ('I am checking the checklist', 'I am checking the checlkist')
0.9 ('I am checking the checklist', 'I am checknig the checklist')
0.9 ('I am checking the checklist', 'I am chceking the checklist')
----
It raises the following exception : ...
Could you check your `similarity_score` function? I suspect you used `i[1]` instead of `[i[1]]`, and as a result your model treats each perturbed example as multiple examples. The correct function and its output should be:
def similarity_score(inputs):
    """Return one similarity score per (original, perturbed) pair, printing inputs and scores."""
    # The perturbed sentence is wrapped in a list so that it is scored as a
    # single sentence; get_similarities iterates over its second argument.
    predictions = np.array([model.get_similarities(i[0], [i[1]]) for i in inputs])
    # Debug output to check that there is exactly one score per input pair.
    print(inputs)
    print(predictions)
    return predictions
[('There is a bug in the code', 'There is a bug in the ocde'), ('There is a bug in the code', 'There is a bug in the cdoe'), ('There is a bug in the code', 'There is a bug in th ecode'), ('There is a bug in the code', 'There is a bug in th ecode'), ('There is a bug in the code', 'hTere is a bug in the code'), ('I am checking the checklist', 'I am checking the checlkist'), ('I am checking the checklist', 'I am checknig the checklist'), ('I am checking the checklist', 'I am checking hte checklist'), ('I am checking the checklist', 'I am chceking the checklist'), ('I am checking the checklist', 'I am checkign the checklist')]
[0.8464688 0.83060527 0.72680354 0.72680354 0.9316598 0.7736648
0.8513911 0.951548 0.8679485 0.8926376 ]
@tongshuangwu Sure. Thanks.
I still see a problem: in this case `test.summary()` does not print just the failed examples, since the whole test case is marked as passed or failed.
One of the examples printed here has a similarity score of 1.0, which is above the threshold, i.e., we consider it similar.
How can we fix this?
I've given the summary output below:
Predicting 10 examples
[('There is a bug in the code', 'There si a bug in the code'), ('There is a bug in the code', 'There is a bugi n the code'), ('There is a bug in the code', 'There is a ubg in the code'), ('There is a bug in the code', 'hTere is a bug in the code'), ('There is a bug in the code', 'There is a bug in the coed'), ('I am checking the checklist', 'I am checking the checlkist'), ('I am checking the checklist', 'I am chceking the checklist'), ('I am checking the checklist', 'I am checking teh checklist'), ('I am checking the checklist', 'I am checknig the checklist'), ('I am checking the checklist', 'I am checknig the checklist')]
[0.97615945 0.9164431 0.5684959 0.9316598 0.7938423 0.7736647
0.86794853 0.9417565 0.85139114 0.85139114]
Test cases: 2
Fails (rate): 1 (50.0%)
Example fails:
1.0 ('There is a bug in the code', 'There si a bug in the code')
0.6 ('There is a bug in the code', 'There is a ubg in the code')
Essentially, this is because what you are defining should be an MFT test (i.e., for each example you have the expectation that the "label" is > 0.9), not an invariance/directional test.
In INV/DIR, the default setting is to always print the first example of a test case regardless of its predicted output, because that is usually the original example before perturbation. In your case, the first one is already an actual test example.
So the simple solution is:
change
test = INV(**t, name='add typos', capability='typo',
description='', expect=expect_all_similar, agg_fn="all")
to
test = MFT(**t, name='add typos', capability='typo',
description='', expect=expect_all_similar, agg_fn="all")
(Sorry for the earlier confusion -- I didn't realize this until now either!)