tgxs002 / HPSv2

Human Preference Score v2: A Solid Benchmark for Evaluating Human Preferences of Text-to-Image Synthesis


Unable to reproduce the preference prediction accuracy results

vishaal27 opened this issue

Hey, thanks so much for your great work and for releasing all your resources publicly.

I was interested in reproducing table 6 in your paper:
[screenshot of Table 6 (preference prediction accuracy) from the paper]

To reproduce the CLIP ViT-H-14 and PickScore results, I slightly modified your evaluation script as follows:

# adopted from: https://github.com/tgxs002/HPSv2/blob/master/hpsv2/evaluation.py

import os
import json
from argparse import ArgumentParser
from PIL import Image
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

from hpsv2.utils import root_path
from transformers import AutoModel, AutoProcessor

### copied from: https://github.com/tgxs002/HPSv2/blob/866735ecaae999fa714bd9edfa05aa2672669ee3/hpsv2/src/training/train.py#L358
def inversion_score(p1, p2):
    assert len(p1) == len(p2), f'{len(p1)}, {len(p2)}'
    n = len(p1)
    cnt = 0
    for i in range(n-1):
        for j in range(i+1, n):
            if p1[i] > p1[j] and p2[i] < p2[j]:
                cnt += 1
            elif p1[i] < p1[j] and p2[i] > p2[j]:
                cnt += 1
    return 1 - cnt / (n * (n - 1) / 2)
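# Worked example (hypothetical values): for p1 = [0, 2, 1, 3] and p2 = [0, 1, 2, 3],
# only the pair (i=1, j=2) is ordered differently in the two lists, so
# inversion_score returns 1 - 1 / (4 * 3 / 2) = 5/6 ≈ 0.83.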

class RankingDataset(Dataset):
    def __init__(self, meta_file, image_folder, transforms, tokenizer):
        self.transforms = transforms
        self.image_folder = image_folder     
        self.open_image = Image.open
        self.tokenizer = tokenizer

        with open(meta_file, 'r') as f:
            self.test_dict = json.load(f)
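        # Each entry of hpdv2_test.json is expected to contain the keys used in __getitem__:
        #   'prompt'     - the text prompt,
        #   'image_path' - a list of candidate image file names generated for that prompt,
        #   'rank'       - the human preference ranking over those candidates.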
    
    def __len__(self):
        return len(self.test_dict)

    def __getitem__(self, idx):
        try:
            dict_ = self.test_dict[idx]
            if self.transforms is not None:
                images = [self.transforms(self.open_image(os.path.join(self.image_folder, file_names))) for file_names in dict_['image_path']]
            else:
                images = [self.open_image(os.path.join(self.image_folder, file_names)) for file_names in dict_['image_path']]

            paths = [os.path.join(self.image_folder, file_names) for file_names in dict_['image_path']]
            label = dict_['rank']
            if self.tokenizer is None:
                caption = dict_['prompt']
            else:
                caption = self.tokenizer(dict_['prompt'])
            return images, paths, label, caption
        except Exception as e:
            raise e
            # return self.__getitem__((idx + 1) % len(self))

def evaluate_rank(data_path, image_folder, model, batch_size, preprocess_val, tokenizer, device):
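    # Note: `preprocess_val` is passed as None in this script, so the dataset returns raw PIL
    # images, and `tokenizer` is the Hugging Face AutoProcessor, which preprocesses both the
    # images and the prompt below.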
    meta_file = data_path + '/hpdv2_test.json' # this is taken from: https://huggingface.co/datasets/ymhao/HPDv2/tree/main
    dataset = RankingDataset(meta_file, image_folder, preprocess_val, None)

    score = 0
    total = len(dataset)
    with torch.inference_mode(), torch.cuda.amp.autocast():
        for sample in tqdm(dataset, total=len(dataset), ascii=True):
            images, paths, labels, caption = sample

            processed_images = tokenizer(
                images=images,
                padding=True,
                truncation=True,
                max_length=77,
                return_tensors="pt",
            ).to(device)
            image_tensor = model.get_image_features(**processed_images)

            c1 = tokenizer(
                text=caption,
                padding=True,
                truncation=True,
                max_length=77,
                return_tensors="pt",
            ).to(device)
            caption_tensor = model.get_text_features(**c1)

            image_tensor /= image_tensor.norm(dim=-1, keepdim=True)
            caption_tensor /= caption_tensor.norm(dim=-1, keepdim=True)
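            # After L2-normalization, the product below gives the cosine similarity between
            # each candidate image and the prompt, scaled by exp(logit_scale).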

            logits_per_image = model.logit_scale.exp() * image_tensor @ caption_tensor.T
            logits_per_image = logits_per_image.squeeze(-1)

            predicted = list(torch.argsort(-logits_per_image).cpu().numpy())
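            # `predicted` lists the candidate image indices from highest to lowest model score.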

            score += inversion_score(predicted, labels)

    print('ranking_acc:', score/total)

def initialize_model():

    device = "cuda"
    processor_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
    model_pretrained_name_or_path = "yuvalkirstain/PickScore_v1" # for pickscore
    # model_pretrained_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K" # for clip model
    processor = AutoProcessor.from_pretrained(processor_name_or_path)
    model = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(device)

    return model, processor

def evaluate(mode: str, root_dir: str, data_path: str = os.path.join(root_path,'datasets/benchmark'), checkpoint_path: str = None, batch_size: int = 20, hps_version: str = "v2.1") -> None:
    
    model, processor = initialize_model()
    
    evaluate_rank(data_path, root_dir, model, batch_size, None, processor, device='cuda')

if __name__ == '__main__':
    # Parse arguments
    parser = ArgumentParser()
    parser.add_argument('--data-type', type=str, required=True, choices=['benchmark', 'benchmark_all', 'test', 'ImageReward', 'drawbench'])
    # this is the path to the folder where the test json is located
    parser.add_argument('--data-path', type=str, required=True, help='path to dataset')
    # this is the path to the folder where the test images are located
    parser.add_argument('--image-path', type=str, required=True, help='path to image files')
    parser.add_argument('--checkpoint', type=str, default=os.path.join(root_path,'HPS_v2_compressed.pt'), help='path to checkpoint')
    parser.add_argument('--batch-size', type=int, default=20)
    args = parser.parse_args()
    
    evaluate(mode=args.data_type, data_path=args.data_path, root_dir=args.image_path, checkpoint_path=args.checkpoint, batch_size=args.batch_size)
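
(For reference, the script is invoked as python <script_name>.py --data-type test --data-path <folder containing hpdv2_test.json> --image-path <folder containing the HPDv2 test images>, where the script name and the two paths are placeholders.)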

When I run this script with both CLIP and PickScore on the HPDv2 test set, I get the following results:

| Model | Results from paper | Reproduced results with the above script |
| --- | --- | --- |
| CLIP ViT-H-14 | 65.10 | 50.50 |
| PickScore | 79.80 | 48.84 |

Could you please check this script and the numbers above to see whether there was an issue on my side? I am unable to figure out any other reason why I cannot reproduce the numbers from table 6. Thanks!

Hey @tgxs002, just bumping this up, could you please verify this? Thanks!