marcotcr / checklist

Beyond Accuracy: Behavioral Testing of NLP models with CheckList

Custom aggregate function results in attribute error.

ashutosh-dwivedi-e3502 opened this issue · comments

Writing a custom expectation aggregate function for test cases results in an attribute error.

The following example demonstrates this:


import checklist
import torch
import numpy as np

from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from checklist.pred_wrapper import PredictorWrapper
from checklist.expect import Expect
from checklist.test_types import INV
from checklist.perturb import Perturb

dataset = [
    'I am checking the checklist',
    'There is a bug in the code',
]


class Model(object):
    THRESHOLD = 0.9

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
        self.model = AutoModel.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def get_encoding(self, sentences):
        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            return self._mean_pooling(model_output, encoded_input['attention_mask'])

    def get_similarities(self, sentence1, other_sentences):    
        e1 = self.get_encoding(str(sentence1))
        e2 = self.get_encoding([str(x) for x in other_sentences])
        return np.squeeze(cosine_similarity(e1, e2))


def similarity_score(inputs):
    all_preds = list()
    for sentence1, other_sentences in inputs:
        scores = model.get_similarities(sentence1, other_sentences)
        all_preds.append(scores)
    return np.array(all_preds)


def all_similar(x, pred, conf, label=None, meta=None):
    """if any of the results is is below threshold testcase doesn't pass"""
    ret = np.sum(pred < Model.THRESHOLD) == 0
    print(f'pred = {pred}, ret = {ret}')
    return ret

def add_typos(sentence, n=5):
    typos = []
    for i in range(n):
        typos.append(Perturb.perturb([sentence], Perturb.add_typos, keep_original=False))
    return sentence, typos


wrapped_pp = PredictorWrapper.wrap_predict(similarity_score)
expect_all_similar = Expect.single(all_similar)

model = Model()

t = Perturb.perturb(dataset, add_typos, nsamples=200, keep_original=False)
test = INV(**t, name='add typos', capability='typo',
           description='', expect=expect_all_similar, agg_fn=expect_all_similar)

test.run(predict_and_confidence_fn=wrapped_pp, overwrite=True, verbose=True)
test.summary()

This results in the following exception:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-38-0b1b8f7c7467> in <module>
      1 test = INV(**t, name='add typos', capability='typo',
      2            description='', expect=expect_all_similar, agg_fn=expect_all_similar)
----> 3 test.run(predict_and_confidence_fn=wrapped_pp, overwrite=True, verbose=True)
      4 test.summary()

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in run(self, predict_and_confidence_fn, overwrite, verbose, n, seed)
    351             print('Predicting %d examples' % len(examples))
    352         preds, confs = predict_and_confidence_fn(examples)
--> 353         self.run_from_preds_confs(preds, confs, overwrite=overwrite)
    354 
    355     def fail_idxs(self):

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in run_from_preds_confs(self, preds, confs, overwrite)
    291         self._check_create_results(overwrite)
    292         self.update_results_from_preds(preds, confs)
--> 293         self.update_expect()
    294 
    295     def run_from_file(self, path, file_format=None, format_fn=None, ignore_header=False, overwrite=False):

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in update_expect(self)
    128         self._check_results()
    129         self.results.expect_results = self.expect(self)
--> 130         self.results.passed = Expect.aggregate(self.results.expect_results, self.agg_fn)
    131 
    132     def example_list_and_indices(self, n=None, seed=None):

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in aggregate(data, agg_fn)
    145         # data is a list of lists or list of np.arrays
    146         # import pdb; pdb.set_trace()
--> 147         return np.array([Expect.aggregate_testcase(x, agg_fn) for x in data])
    148 
    149     @staticmethod

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in <listcomp>(.0)
    145         # data is a list of lists or list of np.arrays
    146         # import pdb; pdb.set_trace()
--> 147         return np.array([Expect.aggregate_testcase(x, agg_fn) for x in data])
    148 
    149     @staticmethod

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in aggregate_testcase(expect_results, agg_fn)
    160             return None
    161         else:
--> 162             return agg_fn(np.array(r))
    163 
    164     @staticmethod

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in expect(self)
     75         """
     76         def expect(self):
---> 77             zipped = iter_with_optional(self.data, self.results.preds, self.results.confs, self.labels, self.meta, self.run_idxs)
     78             return [fn(x, pred, confs, labels, meta) for x, pred, confs, labels, meta in zipped]
     79         return expect

AttributeError: 'numpy.ndarray' object has no attribute 'results'

This happens because the return agg_fn(np.array(r)) line in aggregate_testcase passes a numpy array to the aggregate function, whereas the signature of that function is all_similar(x, pred, conf, label=None, meta=None).
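
For reference, here is a minimal standalone reproduction of the same mismatch (my own illustration, inferred from the code and traceback above rather than from the library source):

import numpy as np
from checklist.expect import Expect

def all_similar(x, pred, conf, label=None, meta=None):
    # passes only if no similarity falls below the threshold
    return np.sum(pred < 0.9) == 0

# Expect.single wraps all_similar into a function that expects the whole test
# object (it reads self.data, self.results.preds, ... as the traceback shows),
# but aggregate_testcase calls it with a plain numpy array instead:
expect_all_similar = Expect.single(all_similar)
expect_all_similar(np.array([0.95, 0.87]))
# AttributeError: 'numpy.ndarray' object has no attribute 'results'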

Hi!
I didn't quite understand why you would want agg_fn=expect_all_similar -- in your case, I believe:

  • You only need to customize the expectation function on each test case, which is what you are doing with expect=expect_all_similar
  • For agg_fn, the aggregation doesn't matter for you, and you should just leave it as the default, because essentially each of your test cases has just one example that looks like (original, perturbed[]). This function is also not for setting expectations on each example, but for aggregating CheckList-formatted results over multiple examples, and it therefore takes a different input format than expect= (i.e. it cannot recognize a numpy.ndarray); see the sketch right after this list. For more on agg_fn, please see this tutorial and this doc.
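
To make the distinction concrete: a custom agg_fn would receive the per-example expect results of one test case as a numpy array and return a single pass/fail value. A rough sketch of the shape of such a function (my own illustration based on the agg_fn(np.array(r)) call in your traceback, not code taken from the library):

import numpy as np

def agg_all_pass(expect_results):
    # expect_results: 1-D numpy array with one entry per example in the test
    # case, i.e. the values returned by the expect= function (positive = pass)
    return np.all(expect_results > 0)

# hypothetical usage, reusing the names from your snippet:
# test = INV(**t, name='add typos', capability='typo',
#            expect=expect_all_similar, agg_fn=agg_all_pass)

But as noted above, in your case you can simply leave agg_fn at its default.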

I'd change two things in your code:

  1. add_typos function -- The CheckList Perturb wrapper gives a Munch (a fancy dict), not raw strings. To get the strings, you will want to do:
def add_typos(sentence, n=5):
    typos = []
    for i in range(n):
        typos.append(Perturb.add_typos(sentence))
        # this is equivalent to:
        # typos.append(Perturb.perturb([sentence], Perturb.add_typos, keep_original=False).data[0][0])
    return sentence, typos

The output of t.data, before and after the change:

# before
[[('I am checking the checklist',
   [MunchWithAdd({'data': [['I am checking th echecklist']]}),
    MunchWithAdd({'data': [['I am chekcing the checklist']]}),
    MunchWithAdd({'data': [['I am cehcking the checklist']]}),
    MunchWithAdd({'data': [['I am chekcing the checklist']]}),
    MunchWithAdd({'data': [['I am checknig the checklist']]})])],
 [('There is a bug in the code',
   [MunchWithAdd({'data': [['Theer is a bug in the code']]}),
    MunchWithAdd({'data': [['There is a bug in the cdoe']]}),
    MunchWithAdd({'data': [['Ther eis a bug in the code']]}),
    MunchWithAdd({'data': [['There is a bgu in the code']]}),
    MunchWithAdd({'data': [['There i sa bug in the code']]})])]]


# after
[[('I am checking the checklist',
   ['I a mchecking the checklist',
    'I am checkign the checklist',
    'I am checkingt he checklist',
    'I am checking the checklits',
    'I am checking the checklits'])],
 [('There is a bug in the code',
   ['There si a bug in the code',
    'There is a bug i nthe code',
    'hTere is a bug in the code',
    'There is a bug in teh code',
    'There is ab ug in the code'])]]
  2. Remove the unnecessary (and incorrectly defined) agg_fn:
test = INV(**t, name='add typos', capability='typo', description='', expect=expect_all_similar)

Side note: If you want your test cases to contain multiple examples, with each example being a string tuple (original, perturbed), you would want to do the following:

# get tupled data from add_typos
def add_typos(sentence, n=5):
    typos = []
    for i in range(n):
        #typos.append(Perturb.perturb([sentence], Perturb.add_typos, keep_original=False))
        typos.append((sentence, Perturb.add_typos(sentence)))
    return typos

t = Perturb.perturb(dataset, add_typos, nsamples=200, keep_original=False)
t.data
"""
# would be:
[[('There is a bug in the code', 'There is a bgu in the code'),
  ('There is a bug in the code', 'There is a bug int he code'),
  ('There is a bug in the code', 'There is a bug in th ecode'),
  ('There is a bug in the code', 'There is a bug in th ecode'),
  ('There is a bug in the code', 'There is a bug in the coed')],
 [('I am checking the checklist', 'I am cehcking the checklist'),
  ('I am checking the checklist', ' Iam checking the checklist'),
  ('I am checking the checklist', 'I am checking teh checklist'),
  ('I am checking the checklist', 'I am checking th echecklist'),
  ('I am checking the checklist', 'I a mchecking the checklist')]]
"""

# similarity function should be between two sentences
def similarity_score(inputs):
    return np.array([model.get_similarities(i[0], [i[1]]) for i in inputs])
wrapped_pp = PredictorWrapper.wrap_predict(similarity_score)

# expect = a function that says the similarity between a sentence pair needs to be above the threshold
def all_similar(x, pred, conf, label=None, meta=None):
    return int(pred > Model.THRESHOLD)
expect_all_similar = Expect.single(all_similar)
# `agg_fn='all'` (the default), which says we expect ALL the examples to behave similarly, i.e., all the pairs have similarity above the threshold.
test = INV(**t, name='add typos', capability='typo',
           description='', expect=expect_all_similar, agg_fn="all")

test.run(predict_and_confidence_fn=wrapped_pp, overwrite=True, verbose=True)
test.summary()

I hope this makes sense, and please feel free to reopen if this doesn't answer your question :)

@tongshuangwu Thanks! Especially for the side note. That does make sense.

There's a small change; I'll make a note of it here for anyone looking at this in the future.
The all_similar function should look at the predictions instead, because the conf returned by wrap_predict is always 1:

def all_similar(x, pred, conf, label=None, meta=None):
    return np.sum(pred > Model.THRESHOLD)

Ah yes, sorry :) Fixed!

Also, the test.summary function in the pairwise example always prints None for the Example fails. Should I create a separate issue for that?

Predicting 10 examples
Test cases:      2
Fails (rate):    2 (100.0%)

Example fails:
None
None
None

----
None
None
None

----

If I use the all_similar function as:

def all_similar(x, pred, conf, label=None, meta=None):
    return int(pred > Model.THRESHOLD)

It raises the following exception:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-108-c2448cc7e16d> in <module>
      2            description='', expect=expect_all_similar, agg_fn="all")
      3 
----> 4 test.run(predict_and_confidence_fn=wrapped_pp, overwrite=True, verbose=True)
      5 test.summary()

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in run(self, predict_and_confidence_fn, overwrite, verbose, n, seed)
    351             print('Predicting %d examples' % len(examples))
    352         preds, confs = predict_and_confidence_fn(examples)
--> 353         self.run_from_preds_confs(preds, confs, overwrite=overwrite)
    354 
    355     def fail_idxs(self):

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in run_from_preds_confs(self, preds, confs, overwrite)
    291         self._check_create_results(overwrite)
    292         self.update_results_from_preds(preds, confs)
--> 293         self.update_expect()
    294 
    295     def run_from_file(self, path, file_format=None, format_fn=None, ignore_header=False, overwrite=False):

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/abstract_test.py in update_expect(self)
    127     def update_expect(self):
    128         self._check_results()
--> 129         self.results.expect_results = self.expect(self)
    130         self.results.passed = Expect.aggregate(self.results.expect_results, self.agg_fn)
    131 

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in expect(self)
     76         def expect(self):
     77             zipped = iter_with_optional(self.data, self.results.preds, self.results.confs, self.labels, self.meta, self.run_idxs)
---> 78             return [fn(x, pred, confs, labels, meta) for x, pred, confs, labels, meta in zipped]
     79         return expect
     80 

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in <listcomp>(.0)
     76         def expect(self):
     77             zipped = iter_with_optional(self.data, self.results.preds, self.results.confs, self.labels, self.meta, self.run_idxs)
---> 78             return [fn(x, pred, confs, labels, meta) for x, pred, confs, labels, meta in zipped]
     79         return expect
     80 

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in expect_fn(xs, preds, confs, label, meta)
     96         """
     97         def expect_fn(xs, preds, confs, label=None, meta=None):
---> 98             return np.array([fn(x, p, c, l, m) for x, p, c,  l, m in iter_with_optional(xs, preds, confs, label, meta)])
     99         return Expect.testcase(expect_fn)#, agg_fn)
    100 

~/.virtualenvs/test-demo-TklxO9OB/lib/python3.8/site-packages/checklist/expect.py in <listcomp>(.0)
     96         """
     97         def expect_fn(xs, preds, confs, label=None, meta=None):
---> 98             return np.array([fn(x, p, c, l, m) for x, p, c,  l, m in iter_with_optional(xs, preds, confs, label, meta)])
     99         return Expect.testcase(expect_fn)#, agg_fn)
    100 

<ipython-input-104-935c7fc5af22> in all_similar(x, pred, conf, label, meta)
      1 def all_similar(x, pred, conf, label=None, meta=None):
----> 2     return int(pred > Model.THRESHOLD)
      3 
      4 expect_all_similar = Expect.single(all_similar)

TypeError: only size-1 arrays can be converted to Python scalars

No need to open a new issue, we can just discuss here.

The test.summary function in the pairwise example always prints None for the Example fails. Should I create a separate issue for that?

This is because your prediction function is a bit unconventional, so the default formatter does not know what to do with it.
To print the results the way you need, you can override the formatter like this:

def format_fn(x, pred, conf, *args, **kwargs):
    return '%.1f %s' % (pred, str(x))
test.summary(format_example_fn=format_fn)

This will give you something like:

Example fails:
0.8 ('There is a bug in the code', 'There is a bug in the ocde')
0.8 ('There is a bug in the code', 'There is a bug in the cdoe')
0.7 ('There is a bug in the code', 'There is a bug in th ecode')

----
0.8 ('I am checking the checklist', 'I am checking the checlkist')
0.9 ('I am checking the checklist', 'I am checknig the checklist')
0.9 ('I am checking the checklist', 'I am chceking the checklist')

----

It raises the following exception: ...

Could you please check your similarity_score function? I suspect you used i[1] instead of [i[1]], and as a result your model treats each of your perturbed examples as multiple examples. The correct output should be:

def similarity_score(inputs):
    predictions = np.array([model.get_similarities(i[0], [i[1]]) for i in inputs])
    print(inputs)
    print(predictions)
    return predictions
[('There is a bug in the code', 'There is a bug in the ocde'), ('There is a bug in the code', 'There is a bug in the cdoe'), ('There is a bug in the code', 'There is a bug in th ecode'), ('There is a bug in the code', 'There is a bug in th ecode'), ('There is a bug in the code', 'hTere is a bug in the code'), ('I am checking the checklist', 'I am checking the checlkist'), ('I am checking the checklist', 'I am checknig the checklist'), ('I am checking the checklist', 'I am checking hte checklist'), ('I am checking the checklist', 'I am chceking the checklist'), ('I am checking the checklist', 'I am checkign the checklist')]
[0.8464688  0.83060527 0.72680354 0.72680354 0.9316598  0.7736648
 0.8513911  0.951548   0.8679485  0.8926376 ]

@tongshuangwu Sure. Thanks.
I still see a problem: the test.summary() function doesn't print just the failed examples in the output in this case, since we mark the whole test case as passed or failed.
One of the examples printed here has a similarity score of 1.0, which is above the threshold and should therefore be considered similar.
How can we fix this?
I've included the summary output below:

Predicting 10 examples
[('There is a bug in the code', 'There si a bug in the code'), ('There is a bug in the code', 'There is a bugi n the code'), ('There is a bug in the code', 'There is a ubg in the code'), ('There is a bug in the code', 'hTere is a bug in the code'), ('There is a bug in the code', 'There is a bug in the coed'), ('I am checking the checklist', 'I am checking the checlkist'), ('I am checking the checklist', 'I am chceking the checklist'), ('I am checking the checklist', 'I am checking teh checklist'), ('I am checking the checklist', 'I am checknig the checklist'), ('I am checking the checklist', 'I am checknig the checklist')]
[0.97615945 0.9164431  0.5684959  0.9316598  0.7938423  0.7736647
 0.86794853 0.9417565  0.85139114 0.85139114]
Test cases:      2
Fails (rate):    1 (50.0%)

Example fails:
1.0 ('There is a bug in the code', 'There si a bug in the code')
0.6 ('There is a bug in the code', 'There is a ubg in the code')

Essentially, this is because what you are defining should be an MFT test (i.e., in each case you have an expectation that the "label" is > 0.9), not an invariance/directional one.

In INV, there's a default setting to always print the first example in each test case regardless of its predicted output, because that's usually the original example before perturbation. In your case, the first one is already an actual test example.

So the simple solution is:
change

test = INV(**t, name='add typos', capability='typo',
           description='', expect=expect_all_similar, agg_fn="all")

to

test = MFT(**t, name='add typos', capability='typo',
           description='', expect=expect_all_similar, agg_fn="all")

(Sorry for the previous confusion -- I also didn't realize this until now!)