USAAIO
1
Part 11 (5 points, coding task)
In this part, you are asked to build your CLIP model.
USAAIO
2
### WRITE YOUR SOLUTION HERE ###
class MyCLIP(nn.Module):
    """CLIP-style dual encoder built from a pretrained ViT and a pretrained BERT.

    For both modalities the hidden state at sequence position 0 of the
    backbone's ``last_hidden_state`` is taken and passed through a
    modality-specific linear layer that maps it to ``embedding_size``
    dimensions.  The embeddings are returned as-is: no normalisation and no
    similarity/loss computation happens inside this class.

    Args:
        embedding_size: Output width of both projection layers.
    """

    def __init__(self, embedding_size: int = 512):
        super().__init__()
        self.model_image = ViTModel.from_pretrained('google/vit-base-patch16-224')
        self.model_text = BertModel.from_pretrained('bert-base-uncased')
        self.embedding_size = embedding_size

        # Read each backbone's width from its config rather than hard-coding
        # it; for the two checkpoints above this is the 768 used originally.
        image_width = self.model_image.config.hidden_size
        text_width = self.model_text.config.hidden_size
        self.last_layer_image = nn.Linear(image_width, self.embedding_size)
        self.last_layer_text = nn.Linear(text_width, self.embedding_size)

        # Learnable scalar that is not used anywhere inside this class.
        # NOTE(review): presumably the log-temperature of the contrastive
        # loss; it starts from a standard-normal draw -- confirm the intended
        # initial value against the loss definition.
        self.log_tau = nn.Parameter(torch.randn(1))

    def encoder_image(self, image_batch):
        """Embed a batch of images.

        Args:
            image_batch: Input forwarded unchanged, as the first positional
                argument, to the ViT backbone.

        Returns:
            The projected position-0 hidden state, one row per batch item,
            with ``embedding_size`` columns.
        """
        hidden_states = self.model_image(image_batch)['last_hidden_state']
        first_token = hidden_states[:, 0]
        return self.last_layer_image(first_token)

    def encoder_text(self, token_id_batch, attention_mask_batch):
        """Embed a batch of tokenised texts.

        Args:
            token_id_batch: Passed to the BERT backbone as ``input_ids``.
            attention_mask_batch: Passed to the BERT backbone as
                ``attention_mask``.

        Returns:
            The projected position-0 hidden state, one row per batch item,
            with ``embedding_size`` columns.
        """
        hidden_states = self.model_text(
            input_ids=token_id_batch,
            attention_mask=attention_mask_batch,
        )['last_hidden_state']
        first_token = hidden_states[:, 0]
        return self.last_layer_text(first_token)

    def forward(self, image_batch, token_id_batch, attention_mask_batch):
        """Return ``(image_embedding, text_embedding)`` for a paired batch."""
        image_embedding = self.encoder_image(image_batch)
        text_embedding = self.encoder_text(token_id_batch, attention_mask_batch)
        return image_embedding, text_embedding
""" END OF THIS PART """
1 Like