joy1515 committed
Commit 5537a5d · verified · 1 Parent(s): eac58f0

initial commit

Files changed (2)
  1. README.md +48 -14
  2. app.py +167 -0
README.md CHANGED
@@ -1,14 +1,48 @@
- ---
- title: Retrieval Ai
- emoji: 😻
- colorFrom: purple
- colorTo: blue
- sdk: gradio
- sdk_version: 5.15.0
- app_file: app.py
- pinned: false
- license: mit
- short_description: A Multi-Modal Retrieval system
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Retrieval-AI
+
+ ## Overview
+ Retrieval-AI is a simple image retrieval application built on a CLIP model and a FAISS index. Users can search for images with natural-language text queries, and the app provides accessibility features such as dark mode and speech-to-text input for visually impaired users.
+
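Conceptually, the app embeds every image and each text query into CLIP's shared vector space and ranks images by cosine similarity using a FAISS inner-product index. A minimal sketch of that idea, using the same model as `app.py` (the image paths and query here are placeholders):

```python
import faiss
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

# Embed a few images (paths are placeholders) and unit-normalize the features,
# so that inner product equals cosine similarity.
images = [Image.open(p).convert("RGB") for p in ["cat.jpg", "beach.jpg"]]
with torch.no_grad():
    img_feats = model.get_image_features(**processor(images=images, return_tensors="pt"))
img_feats = img_feats / img_feats.norm(dim=-1, keepdim=True)

index = faiss.IndexFlatIP(img_feats.shape[1])  # exact inner-product index
index.add(img_feats.numpy())

# Embed the text query the same way and take the top match.
with torch.no_grad():
    txt_feats = model.get_text_features(**processor(text=["a cat"], return_tensors="pt", padding=True))
txt_feats = txt_feats / txt_feats.norm(dim=-1, keepdim=True)
scores, ids = index.search(txt_feats.numpy(), 1)  # ids[0][0] indexes into `images`
```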
+ ## Features
+ - Text-based Image Search: enter a text query to find matching images.
+ - Adjustable Results Count: choose how many results to display.
+ - Example Queries: predefined queries help users get started.
+ - Dark Mode Support: enhances usability in low-light conditions.
+ - Speech-to-Text Input: visually impaired users can speak their queries instead of typing.
+
+ ## Installation
+ ### Prerequisites
+ - Python 3.8+
+ - Required dependencies (see `requirements.txt`; a plausible list is sketched below)
+
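The commit itself does not add a `requirements.txt`. Judging from the imports in `app.py`, an assumed dependency list (not part of this commit) might look like:

```text
gradio
torch
transformers
numpy
kagglehub
pillow
faiss-cpu
tqdm
SpeechRecognition
gTTS
```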
+ ### Setup
+ 1. Clone the repository:
+
+    ```bash
+    git clone https://github.com/yourusername/retrieval-ai.git
+    cd retrieval-ai
+    ```
+
+ 2. Install dependencies:
+
+    ```bash
+    pip install -r requirements.txt
+    ```
+
+ 3. Run the application:
+
+    ```bash
+    python app.py
+    ```
+
+ 4. Open the provided URL in a browser (e.g., `http://127.0.0.1:7860`).
+
+ ## Usage
+ 1. Enter a text query or use speech input.
+ 2. Adjust the number of results (1-10).
+ 3. View the matched images in the results gallery.
+
+ ## Accessibility Features
+ - Dark Mode: a dark UI theme for better readability in low-light environments.
+ - Speech-to-Text: users can dictate their search queries instead of typing; a sketch of the underlying round-trip follows this list.
+
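For reference, the speech round-trip in `app.py` pairs the SpeechRecognition library (speech-to-text) with gTTS (text-to-speech). A minimal sketch, with placeholder file names:

```python
import speech_recognition as sr
from gtts import gTTS

recognizer = sr.Recognizer()
with sr.AudioFile("query.wav") as source:  # placeholder recording
    text = recognizer.recognize_google(recognizer.record(source))  # speech -> text

gTTS(f"Showing results for {text}").save("announcement.mp3")  # text -> speech
```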
+ ## License
+ This project is open source under the MIT License.
+
+ ## Contact
+ For inquiries or contributions, please reach out to Joyce Nhlengetwa at [email protected].
+
app.py ADDED
@@ -0,0 +1,167 @@
+ import gradio as gr
+ import torch
+ from transformers import CLIPProcessor, CLIPModel
+ import numpy as np
+ import kagglehub
+ from PIL import Image
+ import os
+ from pathlib import Path
+ import logging
+ import faiss
+ from tqdm import tqdm
+ import speech_recognition as sr
+ from gtts import gTTS
+ import tempfile
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+ class ImageSearchSystem:
+     def __init__(self):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         logger.info(f"Using device: {self.device}")
+
+         # Load the CLIP model and its processor
+         self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
+         self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(self.device)
+
+         # Dataset state, populated by initialize_dataset()
+         self.image_paths = []
+         self.index = None
+         self.initialized = False
+
+     def initialize_dataset(self) -> None:
+         """Download the Kaggle dataset and build the search index."""
+         try:
+             path = kagglehub.dataset_download("alessandrasala79/ai-vs-human-generated-dataset")
+             image_folder = os.path.join(path, "test_data_v2")
+
+             self.image_paths = [
+                 f for f in Path(image_folder).glob("**/*")
+                 if f.suffix.lower() in [".jpg", ".jpeg", ".png"]
+             ]
+
+             if not self.image_paths:
+                 raise ValueError(f"No images found in {image_folder}")
+
+             logger.info(f"Found {len(self.image_paths)} images")
+
+             self._create_image_index()
+             self.initialized = True
+
+         except Exception as e:
+             logger.error(f"Dataset initialization failed: {str(e)}")
+             raise
+
+     def _create_image_index(self, batch_size: int = 32) -> None:
+         """Embed all images with CLIP and store the features in a FAISS index."""
+         try:
+             all_features = []
+
+             for i in tqdm(range(0, len(self.image_paths), batch_size), desc="Indexing images"):
+                 batch_paths = self.image_paths[i:i + batch_size]
+                 batch_images = [Image.open(img).convert("RGB") for img in batch_paths]
+
+                 if batch_images:
+                     inputs = self.processor(images=batch_images, return_tensors="pt", padding=True)
+                     inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                     with torch.no_grad():
+                         features = self.model.get_image_features(**inputs)
+                         # Unit-normalize so inner product equals cosine similarity
+                         features = features / features.norm(dim=-1, keepdim=True)
+                         all_features.append(features.cpu().numpy())
+
+             all_features = np.concatenate(all_features, axis=0)
+             self.index = faiss.IndexFlatIP(all_features.shape[1])
+             self.index.add(all_features)
+
+             logger.info("Image index created successfully")
+
+         except Exception as e:
+             logger.error(f"Failed to create image index: {str(e)}")
+             raise
+
+     def search(self, query: str, audio_path: str = None, k: int = 5):
+         """Search for images using a text or spoken query."""
+         try:
+             if not self.initialized:
+                 raise RuntimeError("System not initialized. Call initialize_dataset() first.")
+
+             # Convert speech to text if an audio input is provided
+             if audio_path:
+                 recognizer = sr.Recognizer()
+                 with sr.AudioFile(audio_path) as source:
+                     audio_data = recognizer.record(source)
+                 try:
+                     query = recognizer.recognize_google(audio_data)
+                 except sr.UnknownValueError:
+                     return [], "Could not understand the spoken query.", None
+
+             # Embed the text query
+             inputs = self.processor(text=[query], return_tensors="pt", padding=True)
+             inputs = {key: value.to(self.device) for key, value in inputs.items()}
+
+             with torch.no_grad():
+                 text_features = self.model.get_text_features(**inputs)
+                 text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+
+             # Search the FAISS index (k may arrive as a float from the Gradio slider)
+             scores, indices = self.index.search(text_features.cpu().numpy(), int(k))
+             results = [Image.open(self.image_paths[idx]) for idx in indices[0]]
+
+             # Announce the results with text-to-speech
+             tts = gTTS(f"Showing results for {query}")
+             temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+             tts.save(temp_audio.name)
+
+             return results, query, temp_audio.name
+
+         except Exception as e:
+             logger.error(f"Search failed: {str(e)}")
+             return [], "Error during search.", None
+
+ def create_demo_interface() -> gr.Interface:
+     """Create the Gradio interface with dark styling and speech support."""
+     system = ImageSearchSystem()
+
+     try:
+         system.initialize_dataset()
+     except Exception as e:
+         logger.error(f"Failed to initialize system: {str(e)}")
+         raise
+
+     # Each example supplies a value for every input: query, audio, result count
+     examples = [
+         ["a beautiful landscape with mountains", None, 5],
+         ["people working in an office", None, 5],
+         ["a cute dog playing", None, 5],
+         ["a modern city skyline at night", None, 5],
+         ["a delicious-looking meal", None, 5]
+     ]
+
+     return gr.Interface(
+         fn=system.search,
+         inputs=[
+             gr.Textbox(label="Enter your search query:", placeholder="Describe the image...", lines=2),
+             gr.Audio(sources=["microphone"], type="filepath", label="Speak Your Query (Optional)"),
+             gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Number of Results")
+         ],
+         outputs=[
+             gr.Gallery(label="Search Results", show_label=True, columns=5, height="auto"),
+             gr.Textbox(label="Spoken Query", interactive=False),
+             gr.Audio(label="Results Spoken Out Loud")
+         ],
+         title="Multi-Modal Image Search",
+         description="Use text or voice to search for images.",
+         examples=examples,
+         cache_examples=True,
+         # Dark styling is applied via CSS ("dark" is not a built-in Gradio theme)
+         css=".gradio-container {background-color: #121212; color: #ffffff;}"
+     )
+
+
+ if __name__ == "__main__":
+     try:
+         demo = create_demo_interface()
+         # Queueing is on by default in recent Gradio; enable_queue was removed from launch()
+         demo.launch(share=True, max_threads=40)
+     except Exception as e:
+         logger.error(f"Failed to launch app: {str(e)}")
+         raise
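For quick experiments outside the Gradio UI, the class can also be exercised directly. A minimal sketch (the query string is arbitrary):

```python
# Hypothetical direct use of ImageSearchSystem, bypassing the web UI.
system = ImageSearchSystem()
system.initialize_dataset()  # downloads the Kaggle dataset and builds the index
images, spoken_query, audio_path = system.search("a red sports car", k=3)
print(spoken_query, audio_path, len(images))
```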