vlpers/third_party/patches/palavra.patch

diff --git a/caption_retrival_eval.py b/palavra/caption_retrival_eval.py
similarity index 96%
rename from caption_retrival_eval.py
rename to palavra/caption_retrival_eval.py
index eb4aa05..87a719b 100644
--- a/caption_retrival_eval.py
+++ b/palavra/caption_retrival_eval.py
@@ -1,3 +1,9 @@
+# ---------------------------------------------------------------
+# This file has been modified from the file in the following repository:
+# https://github.com/NVlabs/PALAVRA
+# See below for the original license.
+# ---------------------------------------------------------------
+
 # Copyright (C) 2022 NVIDIA Corporation.  All rights reserved.
 #
 # This work is licensed under the LICENSE file 
@@ -10,19 +16,19 @@ import sys
 from torch.utils.data import DataLoader
 sys.path.append("../")
 sys.path.append("clip_language_set")
-from utils.nv import ClipEvalutionEncodeData, TextVisualMap
-from utils.deep_set_clf import D as deep_set
+from palavra.utils.nv import ClipEvalutionEncodeData, TextVisualMap
+from palavra.utils.deep_set_clf import D as deep_set
 import torch.nn.functional as F
 from dataclasses import dataclass
 from simple_parsing import ArgumentParser
 import time
 import os
-from get_f_theta import HParams
+from palavra.get_f_theta import HParams
 import faiss
-from fsl_eval import infer_tokens_from_f_theta, optimize_token, parse_coarse_grained_strings
+from palavra.fsl_eval import infer_tokens_from_f_theta, optimize_token, parse_coarse_grained_strings
 import pandas as pd
 import wandb
-from utils.non_nv import encode_text_with_learnt_tokens
+from palavra.utils.non_nv import encode_text_with_learnt_tokens
 
 from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
 try:
@@ -328,7 +334,7 @@ def main(args_path = None, is_wandb_init = True):
         mlp_model = None
 
         txt_vis_model_name = "txt_vis_model_%s_%s"%(train_args.save_model_name, args.evalparams.model_name)
-        text_vis_model = TextVisualMap()
+        text_vis_model = TextVisualMap(512)
         text_vis_model.to(device)
         text_vis_model.load_state_dict(torch.load(os.path.join(args.evalparams.model_path,txt_vis_model_name)))
 
@@ -379,6 +385,7 @@ def main(args_path = None, is_wandb_init = True):
                                                            seed=args.evalparams.random_seed)
                 train_dataloader = DataLoader(train_dataset, batch_size=args.evalparams.batch_size, shuffle=False)
             gt_text_label_parsed = parse_coarse_grained_strings(args, gt_text_coarse)
+            args.personalize_token_images = "all"
             object_tokens = optimize_token(args, object_tokens, model, train_dataloader, gt_text_label_parsed, text_vis_model)
             print("object_tokens2", object_tokens)
 
diff --git a/fsl_eval.py b/palavra/fsl_eval.py
similarity index 93%
rename from fsl_eval.py
rename to palavra/fsl_eval.py
index 5d01e60..ff05f10 100644
--- a/fsl_eval.py
+++ b/palavra/fsl_eval.py
@@ -1,3 +1,9 @@
+# ---------------------------------------------------------------
+# This file has been modified from the file in the following repository:
+# https://github.com/NVlabs/PALAVRA
+# See below for the original license.
+# ---------------------------------------------------------------
+
 # Copyright (C) 2022 NVIDIA Corporation.  All rights reserved.
 #
 # This work is licensed under the LICENSE file 
@@ -12,7 +18,7 @@ sys.path.append("clip_language_set")
 import torch.optim as optim
 import torch.nn.functional as F
 import re
-from utils.nv import l2_norm_loss, contrastive_loss
+from palavra.utils.nv import l2_norm_loss, contrastive_loss
 
 
 emb_dim: int = 512
@@ -39,9 +45,11 @@ def optimize_token(args, object_tokens, model, train_dataloader, gt_text_label,
     if (args.evalparams.token_optimize_mode == 1):
         with torch.no_grad():
             if args.evalparams.is_coarse_grained_negative_per_class:
-                print("gt_text_label",gt_text_label)
+                pass
+                # print("gt_text_label",gt_text_label)
             else:
                 gt_text_label = [args.evalparams.per_dataset_coarse_grained_phrase for i in range(len(gt_text_label))]
+                # print("gt_text_label",gt_text_label)
 
             coarse_grained_embeddings = get_clip_embedding_from_tokens(args, model, gt_text_label = gt_text_label)
             coarse_grained_embeddings = F.normalize(coarse_grained_embeddings, dim=-1)
@@ -121,6 +129,8 @@ def optimize_trainable_token(args, object_tokens, asterix_token, prompt, model,
         for batch_num, sample in enumerate(train_dataloader):
             optimizer.zero_grad()
             images, label = sample
+            if args.personalize_token_images == "random5":
+                images = images[:, torch.randperm(images.shape[1])[:5]]
 
             image_features_list = []
 
diff --git a/get_f_theta.py b/palavra/get_f_theta.py
similarity index 96%
rename from get_f_theta.py
rename to palavra/get_f_theta.py
index 4d52278..e33434b 100644
--- a/get_f_theta.py
+++ b/palavra/get_f_theta.py
@@ -1,3 +1,9 @@
+# ---------------------------------------------------------------
+# This file has been modified from the file in the following repository:
+# https://github.com/NVlabs/PALAVRA
+# See below for the original license.
+# ---------------------------------------------------------------
+
 # Copyright (C) 2022 NVIDIA Corporation.  All rights reserved.
 #
 # This work is licensed under the LICENSE file 
@@ -10,8 +16,8 @@ from sklearn.model_selection import train_test_split
 import sys
 from torch.utils.data import DataLoader
 sys.path.append("../")
-from utils.non_nv import encode_text_with_learnt_tokens
-from utils.deep_set_clf import D as deep_set
+from palavra.utils.non_nv import encode_text_with_learnt_tokens
+from palavra.utils.deep_set_clf import D as deep_set
 import torch.optim as optim
 from torch.optim.lr_scheduler import StepLR
 import torch.nn.functional as F
@@ -20,10 +26,10 @@ from simple_parsing import ArgumentParser
 import wandb
 import time
 import os
-from utils.nv import TextVisualMap, TextVisualMapAbl, natural_prompt_multi, CustomTextDataset, l2_norm_loss, cosine_loss, MLP, contrastive_loss
+from palavra.utils.nv import TextVisualMap, TextVisualMapAbl, natural_prompt_multi, CustomTextDataset, l2_norm_loss, cosine_loss, MLP, contrastive_loss
 import random
 
-emb_dim: int = 512
+emb_dim: int = 768
 natural_prompt_multi = natural_prompt_multi
 num_tokens = 77
 
@@ -278,7 +284,7 @@ def main():
 
     #Deep set model
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    model, preprocess = clip.load("ViT-B/32", device=device)
+    model, preprocess = clip.load("ViT-L/14", device=device)
 
     #Add personalized text encoder method to CLIP
     funcType = type(model.encode_text)
diff --git a/utils/deep_set_clf.py b/palavra/utils/deep_set_clf.py
similarity index 95%
rename from utils/deep_set_clf.py
rename to palavra/utils/deep_set_clf.py
index ac9051e..e0933e8 100644
--- a/utils/deep_set_clf.py
+++ b/palavra/utils/deep_set_clf.py
@@ -1,4 +1,10 @@
 # ---------------------------------------------------------------
+# This file has been modified from the file in the following repository:
+# https://github.com/NVlabs/PALAVRA
+# See below for the original license.
+# ---------------------------------------------------------------
+#
+# ---------------------------------------------------------------
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 #
 # This file has been modified from the file in the following repository:
diff --git a/utils/non_nv.py b/palavra/utils/non_nv.py
similarity index 100%
rename from utils/non_nv.py
rename to palavra/utils/non_nv.py
diff --git a/utils/nv.py b/palavra/utils/nv.py
similarity index 93%
rename from utils/nv.py
rename to palavra/utils/nv.py
index 70c48aa..52a31bd 100644
--- a/utils/nv.py
+++ b/palavra/utils/nv.py
@@ -1,3 +1,9 @@
+# ---------------------------------------------------------------
+# This file has been modified from the file in the following repository:
+# https://github.com/NVlabs/PALAVRA
+# See below for the original license.
+# ---------------------------------------------------------------
+# 
 # Copyright (C) 2022 NVIDIA Corporation.  All rights reserved.
 #
 # This work is licensed under the LICENSE file 
@@ -6,7 +12,7 @@
 import os
 import numpy as np
 from torch.utils.data import Dataset
-from utils.non_nv import temporary_random_numpy_seed
+from palavra.utils.non_nv import temporary_random_numpy_seed
 import torchvision
 import torch
 import torch.nn as nn
@@ -98,20 +104,20 @@ class ClipEvalutionEncodeData(Dataset):
 
 class TextVisualMap(nn.Module):
 
-  def __init__(self):
+  def __init__(self, emb_dim):
     super().__init__()
     self.layers = nn.Sequential(
-      nn.Linear(512, 512),
+      nn.Linear(emb_dim, emb_dim),
     )
   def forward(self, x):
     return self.layers(x) + x
 
 class TextVisualMapAbl(nn.Module):
 
-  def __init__(self):
+  def __init__(self, emb_dim):
     super().__init__()
     self.layers = nn.Sequential(
-      nn.Linear(512, 512),
+      nn.Linear(emb_dim, emb_dim),
     )
   def forward(self, x):
     return x
@@ -133,15 +139,15 @@ class LinearEmb(nn.Module):
 
 class MLP(nn.Module):
   #No license needed, can be moved to nv.py?
-  def __init__(self, dropout = 0.5):
+  def __init__(self, dropout = 0.5, emb_dim = None):
     super().__init__()
     self.layers = nn.Sequential(
       nn.Flatten(),
       nn.Dropout(p=dropout),
-      nn.Linear(512, 512),
+      nn.Linear(emb_dim, emb_dim),
       nn.ReLU(),
       nn.Dropout(p=dropout),
-      nn.Linear(512, 512),
+      nn.Linear(emb_dim, emb_dim),
     )
 
   def forward(self, x):
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..ca61b93
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,3 @@
+from setuptools import setup, find_packages
+
+setup(name='palavra', version='1.0', packages=find_packages())
\ No newline at end of file