From 578a1d3e2e8840ec7b9e825797780e44d904a669 Mon Sep 17 00:00:00 2001
From: Jong Wook Kim
Date: Sat, 30 Jan 2021 00:22:03 +0900
Subject: [PATCH] moved the notebook to subfolder

---
 README.md                       | 2 +-
 model.py                        | 4 ++--
 .../Interacting_with_CLIP.ipynb | 0
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename Interacting_with_CLIP.ipynb => notebooks/Interacting_with_CLIP.ipynb (100%)

diff --git a/README.md b/README.md
index d7eaf7a..0a9cb4f 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # CLIP
 
-[[Blog]](https://openai.com/blog/clip/) [[Paper]](https://cdn.openai.com/papers/Learning_Transferable_Visual_Models_From_Natural_Language_Supervision.pdf) [[Model Card]](model-card.md) [[Colab]](https://colab.research.google.com/github/openai/clip/blob/master/Interacting_with_CLIP.ipynb)
+[[Blog]](https://openai.com/blog/clip/) [[Paper]](https://cdn.openai.com/papers/Learning_Transferable_Visual_Models_From_Natural_Language_Supervision.pdf) [[Model Card]](model-card.md) [[Colab]](https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Interacting_with_CLIP.ipynb)
 
 CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. It can be instructed in natural language to predict the most relevant text snippet, given an image, without directly optimizing for the task, similarly to the zero-shot capabilities of GPT-2 and 3. We found CLIP matches the performance of the original ResNet50 on ImageNet “zero-shot” without using any of the original 1.28M labeled examples, overcoming several major challenges in computer vision.
 
diff --git a/model.py b/model.py
index 8badb20..cbfc09c 100644
--- a/model.py
+++ b/model.py
@@ -329,11 +329,11 @@ class CLIP(nn.Module):
 
         # cosine similarity as logits
         logit_scale = self.logit_scale.exp()
-        logits_per_iamge = logit_scale * image_features @ text_features.t()
+        logits_per_image = logit_scale * image_features @ text_features.t()
         logits_per_text = logit_scale * text_features @ image_features.t()
 
         # shape = [global_batch_size, global_batch_size]
-        return logits_per_iamge, logits_per_text
+        return logits_per_image, logits_per_text
 
 
 def convert_weights(model: nn.Module):
diff --git a/Interacting_with_CLIP.ipynb b/notebooks/Interacting_with_CLIP.ipynb
similarity index 100%
rename from Interacting_with_CLIP.ipynb
rename to notebooks/Interacting_with_CLIP.ipynb
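
For reference, a minimal sketch of how the two logit matrices returned by `CLIP.forward` (the `logits_per_image` / `logits_per_text` pair renamed in the patch above) can be consumed. The random feature tensors, the fixed `logit_scale` value, and the symmetric cross-entropy usage below are illustrative stand-ins, not code from this repository:

```python
import torch
import torch.nn.functional as F

batch_size, dim = 4, 512

# Stand-ins for encoder outputs; in the real model these come from
# CLIP.encode_image / CLIP.encode_text followed by L2 normalization.
image_features = F.normalize(torch.randn(batch_size, dim), dim=-1)
text_features = F.normalize(torch.randn(batch_size, dim), dim=-1)
logit_scale = torch.tensor(100.0)  # exp of a learned temperature; illustrative value

# Scaled cosine similarities: logits_per_image[i, j] compares image i with text j,
# and logits_per_text is its transpose.
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logit_scale * text_features @ image_features.t()

# Zero-shot style usage: softmax over texts gives per-image label probabilities.
probs = logits_per_image.softmax(dim=-1)

# Contrastive-training style usage: matching (image, text) pairs lie on the diagonal.
labels = torch.arange(batch_size)
loss = (F.cross_entropy(logits_per_image, labels) +
        F.cross_entropy(logits_per_text, labels)) / 2

print(probs.shape, loss.item())
```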