Unable to reproduce the preference prediction accuracy results
vishaal27 opened this issue · comments
Hey, thanks so much for your great work and for releasing all your resources publicly.
I was interested in reproducing table 6 in your paper:
To reproduce the CLIP-ViT-H-14 and PickScore model results, I modified your evaluation script slightly to use the following script:
# adopted from: https://github.com/tgxs002/HPSv2/blob/master/hpsv2/evaluation.py
import os
import json
import numpy as np
from tqdm import tqdm
from argparse import ArgumentParser
from PIL import Image
from tqdm import tqdm
import huggingface_hub
import torch
from torch.utils.data import Dataset, DataLoader
from open_clip import create_model_and_transforms, get_tokenizer
from hpsv2.utils import root_path, hps_version_map
from transformers import AutoModel, AutoProcessor
### copied from: https://github.com/tgxs002/HPSv2/blob/866735ecaae999fa714bd9edfa05aa2672669ee3/hpsv2/src/training/train.py#L358
def inversion_score(p1, p2):
assert len(p1) == len(p2), f'{len(p1)}, {len(p2)}'
n = len(p1)
cnt = 0
for i in range(n-1):
for j in range(i+1, n):
if p1[i] > p1[j] and p2[i] < p2[j]:
cnt += 1
elif p1[i] < p1[j] and p2[i] > p2[j]:
cnt += 1
return 1 - cnt / (n * (n - 1) / 2)
class RankingDataset(Dataset):
def __init__(self, meta_file, image_folder, transforms, tokenizer):
self.transforms = transforms
self.image_folder = image_folder
self.open_image = Image.open
self.tokenizer = tokenizer
with open(meta_file, 'r') as f:
self.test_dict = json.load(f)
def __len__(self):
return len(self.test_dict)
def __getitem__(self, idx):
try:
dict_ = self.test_dict[idx]
if self.transforms is not None:
images = [self.transforms(self.open_image(os.path.join(self.image_folder, file_names))) for file_names in dict_['image_path']]
else:
images = [self.open_image(os.path.join(self.image_folder, file_names)) for file_names in dict_['image_path']]
paths = [os.path.join(self.image_folder, file_names) for file_names in dict_['image_path']]
label = dict_['rank']
if self.tokenizer is None:
caption = dict_['prompt']
else:
caption = self.tokenizer(dict_['prompt'])
return images, paths, label, caption
except Exception as e:
raise e
# return self.__getitem__((idx + 1) % len(self))
def evaluate_rank(data_path, image_folder, model, batch_size, preprocess_val, tokenizer, device):
meta_file = data_path + '/hpdv2_test.json' # this is taken from: https://huggingface.co/datasets/ymhao/HPDv2/tree/main
dataset = RankingDataset(meta_file, image_folder, preprocess_val, None)
score = 0
total = len(dataset)
all_rankings = []
with torch.inference_mode(), torch.cuda.amp.autocast():
for sample in tqdm(dataset, total=len(dataset), ascii=True):
images, paths, labels, caption = sample
processed_images = tokenizer(
images=images,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt",
).to(device)
image_tensor = model.get_image_features(**processed_images)
c1 = tokenizer(
text=caption,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt",
).to(device)
caption_tensor = model.get_text_features(**c1)
image_tensor /= image_tensor.norm(dim=-1, keepdim=True)
caption_tensor /= caption_tensor.norm(dim=-1, keepdim=True)
num_images = image_tensor.shape[0]
logits_per_image = model.logit_scale.exp() * image_tensor @ caption_tensor.T
logits_per_image = logits_per_image.squeeze(-1)
predicted = list(torch.argsort(-logits_per_image).cpu().numpy())
score += inversion_score(predicted, labels)
print('ranking_acc:', score/total)
def initialize_model():
device = "cuda"
processor_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
model_pretrained_name_or_path = "yuvalkirstain/PickScore_v1" # for pickscore
# model_pretrained_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K" # for clip model
processor = AutoProcessor.from_pretrained(processor_name_or_path)
model = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(device)
return model, processor
def evaluate(mode: str, root_dir: str, data_path: str = os.path.join(root_path,'datasets/benchmark'), checkpoint_path: str = None, batch_size: int = 20, hps_version: str = "v2.1") -> None:
model, processor = initialize_model()
evaluate_rank(data_path, root_dir, model, batch_size, None, processor, device='cuda')
if __name__ == '__main__':
# Parse arguments
parser = ArgumentParser()
parser.add_argument('--data-type', type=str, required=True, choices=['benchmark', 'benchmark_all', 'test', 'ImageReward', 'drawbench'])
# this is the path to the folder where the test json is located
parser.add_argument('--data-path', type=str, required=True, help='path to dataset')
# this is the path to the folder where the test images are located
parser.add_argument('--image-path', type=str, required=True, help='path to image files')
parser.add_argument('--checkpoint', type=str, default=os.path.join(root_path,'HPS_v2_compressed.pt'), help='path to checkpoint')
parser.add_argument('--batch-size', type=int, default=20)
args = parser.parse_args()
evaluate(mode=args.data_type, data_path=args.data_path, root_dir=args.image_path, checkpoint_path=args.checkpoint, batch_size=args.batch_size)
When I run this script with both CLIP and PickScore on the HPDv2 test set, I get the following results:
Model | Results from paper | Reproduced results with the above script |
---|---|---|
CLIP ViT-H-14 | 65.10 | 50.50 |
PickScore | 79.80 | 48.84 |
Could you please check and verify this script and the numbers to see if there was any issue from my side in reproducing the numbers? I am unable to figure out any other reasons for why I cannot reproduce the numbers from table 6. Thanks!
Hey @tgxs002, bumping this up slightly, could you please verify this? Thanks!