Update models/tag2text.py
models/tag2text.py (+14 -2)
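This change replaces the model's single global tag threshold with a per-class threshold vector: a default of 0.68 applies to most tags, while the frequently over-predicted tags "person" (2701), "man" (2828), and "woman" (1167) are raised to 0.7. It also documents the number-word tags removed via delete_tag_index because they tend to disturb captioning.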
@@ -26,7 +26,14 @@ def read_json(rpath):
     with open(rpath, 'r') as f:
         return json.load(f)
 
+# delete some tags that may disturb captioning
+# 127: "quarter"; 2961: "back"; 3351: "two"; 3265: "three"; 3338: "four"; 3355: "five"; 3359: "one"
 delete_tag_index = [127,2961, 3351, 3265, 3338, 3355, 3359]
+
+# adjust thresholds for some tags
+# default threshold: 0.68
+# 2701: "person"; 2828: "man"; 1167: "woman";
+tag_thrshold = {2701:0.7, 2828: 0.7, 1167: 0.7}
 
 class Tag2Text_Caption(nn.Module):
     def __init__(self,
@@ -36,7 +43,7 @@ class Tag2Text_Caption(nn.Module):
                  vit_grad_ckpt = False,
                  vit_ckpt_layer = 0,
                  prompt = 'a picture of ',
-                 threshold = 0.
+                 threshold = 0.68,
                  ):
         """
         Args:
@@ -105,6 +112,10 @@ class Tag2Text_Caption(nn.Module):
         tie_encoder_decoder_weights(self.tag_encoder,self.vision_multi,'',' ')
         self.tag_array = tra_array
 
+        self.class_threshold = torch.ones(self.num_class) * self.threshold
+        for key,value in tag_thrshold.items():
+            self.class_threshold[key] = value
+
     def del_selfattention(self):
         del self.vision_multi.embeddings
         for layer in self.vision_multi.encoder.layer:
@@ -130,7 +141,8 @@ class Tag2Text_Caption(nn.Module):
 
         logits = self.fc(mlr_tagembedding[0])
 
-        targets = torch.where(torch.sigmoid(logits) > self.threshold , torch.tensor(1.0).to(image.device), torch.zeros(self.num_class).to(image.device))
+        # targets = torch.where(torch.sigmoid(logits) > self.threshold , torch.tensor(1.0).to(image.device), torch.zeros(self.num_class).to(image.device))
+        targets = torch.where(torch.sigmoid(logits) > self.class_threshold.to(image.device) , torch.tensor(1.0).to(image.device), torch.zeros(self.num_class).to(image.device))
 
         tag = targets.cpu().numpy()
         tag[:,delete_tag_index] = 0
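To try the new logic outside the model, here is a minimal, self-contained sketch of the per-class thresholding. The num_class value, batch size, and random logits are stand-ins for what the real model derives from its config and from self.fc(mlr_tagembedding[0]); the boolean comparison followed by .float() is equivalent to the torch.where call in the diff.

# Minimal sketch of the per-class thresholding above (illustrative values).
import torch

num_class = 3429                    # stand-in for the tag vocabulary size
threshold = 0.68                    # default threshold from __init__

# raised thresholds for frequently over-predicted tags: person / man / woman
tag_thrshold = {2701: 0.7, 2828: 0.7, 1167: 0.7}
# number-word tags excluded from captioning
delete_tag_index = [127, 2961, 3351, 3265, 3338, 3355, 3359]

# one threshold per tag, starting from the global default
class_threshold = torch.ones(num_class) * threshold
for key, value in tag_thrshold.items():
    class_threshold[key] = value

# fake logits for a batch of 2 images; the model produces these
# with self.fc(mlr_tagembedding[0])
logits = torch.randn(2, num_class)

# (batch, num_class) > (num_class,) broadcasts row-wise, so every tag is
# compared against its own threshold; .float() matches the torch.where output
targets = (torch.sigmoid(logits) > class_threshold).float()

tag = targets.numpy()
tag[:, delete_tag_index] = 0        # suppress the deleted tags

Broadcasting the (num_class,) threshold vector against the (batch, num_class) score matrix keeps tag decoding to a single elementwise comparison while still allowing individual tags to be made stricter.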