Is TextGenerationEvaluator incomplete?
inf3rnus opened this issue
Hey all, fascinating library you got going on here.
I was trying to get a working example computing perplexity for gpt2 on EleutherAI/lambada_openai.
Unfortunately, the only way I could get it working was by doing something like:
from transformers import (
    AutoTokenizer,
    pipeline as trans_pipeline,
)

import evaluate
from datasets import load_dataset

task = "text-generation"
task_evaluator = evaluate.evaluator(task)

dataset_name = "EleutherAI/lambada_openai"
data = load_dataset(dataset_name, split="test").shuffle(seed=42).select(range(10))

model = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model)

pipe = trans_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # accelerator="ort",
)

perplexity = evaluate.load("perplexity", module_type="metric")

references = data["text"]
predictions = pipe(references)
# each pipeline output is a list of dicts, so keep just the generated text
predictions = list(map(lambda prediction: prediction[0]["generated_text"], predictions))

perplexity.add_batch(predictions=predictions, references=references)

value = perplexity.compute(model_id="gpt2")

print("Perplexity is: ", value)
Okay, so that works; what doesn't is the task evaluator class for text generation. This code throws:
import evaluate
from datasets import load_dataset

task = "text-generation"
task_evaluator = evaluate.evaluator(task)

dataset_name = "EleutherAI/lambada_openai"
data = load_dataset(dataset_name, split="test").shuffle(seed=42).select(range(10))

model = "gpt2"

eval_results = task_evaluator.compute(
    model_or_pipeline=model,
    data=data,
    metric="perplexity",
    # label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
)

print(eval_results)
I get the error:
Exception has occurred: ValueError (note: full exception trace is shown but execution is paused at: _run_module_as_main)
Evaluation module cache file doesn't exist. Please make sure that you call `add` or `add_batch` at least once before calling `compute`.
And this is likely because compute in evaluator/base.py needs metric inputs returned from self.prepare_data() in the TextGenerationEvaluator class, but the first element of the tuple returned here is always empty:
def prepare_data(self, data: Dataset, input_column: str, *args, **kwargs) -> Tuple[Dict, DatasetColumn]:
    """
    Prepare data.

    Args:
        data ([`Dataset`]):
            Specifies the dataset we will run evaluation on.
        input_column (`str`, defaults to `"text"`):
            The name of the column containing the text feature in the dataset specified by `data`.
    Returns:
        `dict`: metric inputs.
        `list`: pipeline inputs.
    """
    self.check_required_columns(data, {"input_column": input_column})
    return {}, DatasetColumn(data, input_column)
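For comparison, the classification evaluators do return the references as metric inputs from prepare_data, roughly like this (my paraphrase of the text-classification evaluator, not the exact source):

def prepare_data(self, data, input_column, label_column, *args, **kwargs):
    self.check_required_columns(data, {"input_column": input_column, "label_column": label_column})
    # references go into the metric inputs; the text column feeds the pipeline
    return {"references": data[label_column]}, DatasetColumn(data, input_column)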
but it is consumed by the evaluator's compute() below, which then has nothing to compare against when evaluating perplexity. That's why the error above is produced: inside metric.compute() in module.py, the guard

if any(v is not None for v in inputs.values()):
    self.add_batch(**inputs)

never fires, so self._finalize() raises because nothing was ever added.
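For context, the surrounding logic in module.py looks roughly like this (my paraphrase, not the exact source):

# rough paraphrase of EvaluationModule.compute() in module.py
def compute(self, *, predictions=None, references=None, **kwargs):
    all_kwargs = {"predictions": predictions, "references": references, **kwargs}
    inputs = {name: all_kwargs.get(name) for name in self._feature_names()}
    if any(v is not None for v in inputs.values()):
        self.add_batch(**inputs)
    self._finalize()  # raises the ValueError above if add()/add_batch() never wrote a cache file
    ...

And for reference, here is the evaluator's compute() from evaluator/base.py that ties all of this together: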
def compute(
    self,
    model_or_pipeline: Union[
        str,
        "Pipeline",
        Callable,
        "PreTrainedModel",
        "TFPreTrainedModel",  # noqa: F821
    ] = None,
    data: Union[str, Dataset] = None,
    subset: Optional[str] = None,
    split: Optional[str] = None,
    metric: Union[str, EvaluationModule] = None,
    tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None,  # noqa: F821
    feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None,  # noqa: F821
    strategy: Literal["simple", "bootstrap"] = "simple",
    confidence_level: float = 0.95,
    n_resamples: int = 9999,
    device: int = None,
    random_state: Optional[int] = None,
    input_column: str = "text",
    label_column: str = "label",
    label_mapping: Optional[Dict[str, Number]] = None,
) -> Dict[str, float]:

    result = {}

    self.check_for_mismatch_in_device_setup(device, model_or_pipeline)

    # Prepare inputs
    data = self.load_data(data=data, subset=subset, split=split)
    metric_inputs, pipe_inputs = self.prepare_data(
        data=data, input_column=input_column, label_column=label_column
    )
    pipe = self.prepare_pipeline(
        model_or_pipeline=model_or_pipeline,
        tokenizer=tokenizer,
        feature_extractor=feature_extractor,
        device=device,
    )
    metric = self.prepare_metric(metric)

    # Compute predictions
    predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)
    predictions = self.predictions_processor(predictions, label_mapping)

    metric_inputs.update(predictions)

    # Compute metrics from references and predictions
    metric_results = self.compute_metric(
        metric=metric,
        metric_inputs=metric_inputs,
        strategy=strategy,
        confidence_level=confidence_level,
        n_resamples=n_resamples,
        random_state=random_state,
    )

    # TODO: To clarify why `wer` and `cer` return float
    # even though metric.compute contract says that it
    # returns Optional[dict].
    if type(metric_results) == float:
        metric_results = {metric.name: metric_results}

    result.update(metric_results)
    result.update(perf_results)

    return result
I'm wondering if this evaluator is still in progress, only works with some metrics for the time being, or what the deal is?
Final question: all the logic for determining how a metric is computed is held within the metric itself, correct?
So e.g. if I were trying to compute accuracy on lambada, I'd have to implement that logic myself?
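(For example, for last-word accuracy on lambada I'd expect to have to produce the matched/not-matched labels myself and then just hand them to the accuracy metric, something like this purely hypothetical sketch:)

import evaluate

accuracy = evaluate.load("accuracy")

# hypothetical: I'd have to compute these flags myself, e.g. whether the model's
# continuation reproduced the held-out last word of each lambada passage
predictions = [1, 0, 1]
references = [1, 1, 1]

print(accuracy.compute(predictions=predictions, references=references))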
Many thanks!