Using FaRL as FaceCLIP

import torch
import clip
from PIL import Image

device ="cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/16", device="cpu")
model = model.to(device)
farl_state=torch.load("FaRL-Base-Patch16-LAIONFace20M-ep16.pth") # you can download from <https://github.com/FacePerceiver/FaRL#pre-trained-backbones>
model.load_state_dict(farl_state["state_dict"],strict=False)

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)

comments

facial attribute를 잘 표현하는 feature extractor로 사용하여 diet-nerf pipeline에 사용해볼만 하다고 생각.