Proper range for similarity
zhaowenZhou opened this issue · comments
I compute the cosine similarity between one image and multiple texts. The image is a picture of a vehicle and the texts are [vehicle, motor, Computer]; the similarities are 0.25, 0.22, and 0.19. Is this correct? Intuitively, I would expect the best-matching pair to score close to 1 and the worst-matching pair to score close to 0.
Here's my code
`
from torchvision import transforms
import PIL
from PIL import Image
import sys
sys.path.append("./clip")
import clip
import torch
from torch import nn
# Cosine similarity taken along dim 1 of the two feature tensors.
criterion = nn.CosineSimilarity(dim=1, eps=1e-6)

# encode_img
# Hand-built preprocessing: resize to 224x224 (bicubic), convert to a tensor,
# then normalize each of the three channels.
# NOTE(review): the mean/std constants are presumably the ones the CLIP
# checkpoint expects -- confirm against the transform returned by clip.load.
T = transforms.Compose([
    transforms.Resize((224, 224), interpolation=Image.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073),
        (0.26862954, 0.26130258, 0.27577711),
    ),
])

# Use the first GPU when one is present; otherwise run on the CPU instead of
# failing at model load time.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# NOTE(review): `preprocess` is not used below; the hand-built `T` is applied
# instead -- confirm the two pipelines are meant to differ.
model, preprocess = clip.load('ViT-B/32', device, jit=False)

# PNG files may carry an alpha channel, a palette, or a single grey channel.
# Normalize above is configured for exactly three channels, so force RGB.
# The context manager closes the underlying file once the pixels are loaded.
with Image.open("./imgs/2.png") as img_file:
    img = img_file.convert("RGB")

# Add a leading batch dimension and move the tensor to the model's device.
im = T(img).unsqueeze(0).to(device)

# Inference only: no gradients are needed for feature extraction.
with torch.no_grad():
    img_feat = model.encode_image(im).float()
# encode_txt
def encoder_text(text: str) -> torch.Tensor:
    """Return the float32 CLIP text feature for a prompt built from *text*.

    The label is inserted into the template ``"An image depicting a {text}"``,
    tokenized with ``clip.tokenize``, moved to the module-level ``device`` and
    encoded with the module-level ``model``.

    Args:
        text: Label to place into the prompt template.

    Returns:
        The output of ``model.encode_text`` for the prompt, cast to float.
    """
    text_inputs = clip.tokenize(f"An image depicting a {text}").to(device)
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        text_feature = model.encode_text(text_inputs).float()
    # The feature is returned un-normalized: the script compares it through
    # nn.CosineSimilarity, which already divides by the vector norms.
    return text_feature
# Encode the prompt for the label 'Computer' and score it against the image.
txt_feat = encoder_text('Computer')
similarity = criterion(img_feat, txt_feat)
# Emit the tensor and then its plain Python scalar, one per line.
print(similarity, similarity.item(), sep="\n")
`
I have the same problem when I use CLIP for search.