the11 committed
Commit a704a0c · verified · 1 Parent(s): 7158e39

Upload 9 files

Files changed (9)
  1. README.md +89 -7
  2. emotion.py +27 -0
  3. gitattributes +35 -0
  4. llm.py +32 -0
  5. main.py +7 -0
  6. rag.py +55 -0
  7. requirements.txt +11 -0
  8. tts_gemini.py +41 -0
  9. ui.py +83 -0
README.md CHANGED
@@ -1,12 +1,94 @@
  ---
- title: Voice Activated RAG System
- emoji: 🐢
- colorFrom: blue
- colorTo: purple
+ title: Voice-Activated RAG System
+ emoji: 🗣️
+ colorFrom: indigo
+ colorTo: blue
  sdk: gradio
- sdk_version: 5.36.2
- app_file: app.py
+ sdk_version: 5.34.2
+ app_file: main.py
  pinned: false
  ---
+ # Voice to LLM & Sentiment Analyzer with RAG

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ## Overview
+ This project is a voice-driven application that combines several machine learning models and APIs to answer spoken questions about uploaded PDF documents. A query spoken into the microphone is transcribed with Whisper, relevant context is retrieved from the uploaded PDFs using a Retrieval-Augmented Generation (RAG) approach, the query's sentiment is analyzed, and the LLM response is returned both as text and as spoken audio generated with Gemini TTS (Google GenAI API).
+
+ ## Features
+ - Voice input for user queries
+ - PDF document processing and context retrieval
+ - Sentiment analysis using a pre-trained emotion classification model
+ - Text-to-Speech output for the model's responses (using Gemini TTS)
+ - Integration with the Groq API for advanced language model capabilities
+
+ ## Modular Structure
+
+ This project is organized into modular components for maintainability and clarity:
+
+ - `main.py`: Entry point to run the app.
+ - `ui.py`: Gradio UI layout and event wiring.
+ - `tts_gemini.py`: Gemini TTS logic (text-to-speech).
+ - `emotion.py`: Emotion detection and tone mapping.
+ - `rag.py`: PDF processing, chunking, embedding, FAISS, and context retrieval.
+ - `llm.py`: LLM prompt construction and response logic.
+
+ ## Setup Instructions
+ 1. Clone the repository:
+ ```
+ git clone <repository-url>
+ cd GoComet-C4
+ ```
+
+ 2. Run the setup script to create a virtual environment and install dependencies (Linux/macOS):
+ ```
+ bash setup.sh
+ ```
+ On Windows, run each command from `setup.sh` manually in your terminal:
+ ```
+ python -m venv venv
+ venv\Scripts\activate
+ pip install -r requirements.txt
+ set GROQ_API_KEY="<your-groq-api-key>"
+ set GEMINI_API_KEY="<your-gemini-api-key>"
+ ```
+
+ 3. Activate the virtual environment:
+ - Linux/macOS:
+ ```
+ source venv/bin/activate
+ ```
+ - Windows:
+ ```
+ venv\Scripts\activate
+ ```
+
+ ## Usage
+ To run the application, execute the following command:
+ ```
+ python main.py
+ ```
+
+ Once the application is running, you can upload PDF files and use the microphone to speak your queries. For each query, the interface shows the LLM output, detected sentiment, transcript, and retrieved context, along with a spoken audio response (.wav). TTS audio files are saved in a `tts_outputs` directory in the project root.
+
+ ## Dependencies
+ This project requires the following Python libraries:
+ - gradio
+ - openai-whisper
+ - groq
+ - transformers
+ - PyPDF2
+ - sentence-transformers
+ - faiss-cpu
+ - rapidfuzz
+ - soundfile
+ - numpy
+ - google-genai
+
+ Install these dependencies using the `requirements.txt` file provided in the project.
+
+ ## Latency Logging
+
+ After each run, the latency (processing time in seconds) of each pipeline component is appended to `logs/latency_log.csv` with the following columns:
+
+ | Whisper STT (s) | Document Retrieval (s) | Sentiment Analysis (s) | Response Gen (LLM) (s) | TTS Synthesis (s) | Total (s) |
+ |-----------------|------------------------|------------------------|------------------------|-------------------|-----------|
+
+ This file accumulates results from all runs, allowing you to analyze and monitor performance over time.
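
As a rough illustration of how that log can be analyzed (not part of this commit), per-stage averages could be computed with a short script like the sketch below, assuming at least one run has been logged; the column names follow the table above:

```python
import csv
from statistics import mean

# Hypothetical helper: summarize logs/latency_log.csv produced by ui.py.
with open("logs/latency_log.csv", newline="") as f:
    rows = list(csv.DictReader(f))

for column in rows[0].keys():
    avg = mean(float(row[column]) for row in rows)
    print(f"{column}: {avg:.3f} s average over {len(rows)} runs")
```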
emotion.py ADDED
@@ -0,0 +1,27 @@
+ from transformers import pipeline
+
+ # Emotion to tone instruction mapping
+ emotion_tone_map = {
+     "Sadness": "Be comforting, empathetic, and gentle.",
+     "Anger": "Stay calm, respectful, and de-escalate.",
+     "Love": "Be warm, appreciative, and encouraging.",
+     "Surprise": "Be affirming and help clarify what's surprising.",
+     "Fear": "Be reassuring and emphasize safety/facts.",
+     "Happiness": "Be enthusiastic and congratulatory.",
+     "Neutral": "Be informative and straightforward.",
+     "Disgust": "Be clinical, non-judgmental, and clarify facts.",
+     "Shame": "Be kind, avoid blame, and uplift the user.",
+     "Guilt": "Be compassionate and reduce self-blame.",
+     "Confusion": "Be extra clear and explain step-by-step.",
+     "Desire": "Be supportive and help guide constructively.",
+     "Sarcasm": "Stay serious, clarify misunderstandings politely.",
+ }
+
+ emotion_classifier = pipeline("text-classification", model="boltuix/bert-emotion")
+
+ def get_emotion_and_tone(text):
+     emotions = emotion_classifier(text)
+     detected_emotion = emotions[0]["label"].capitalize() if emotions else "Neutral"
+     emotion = detected_emotion if detected_emotion in emotion_tone_map else "Neutral"
+     tone_instruction = emotion_tone_map.get(emotion, "Be informative and polite.")
+     return emotion, tone_instruction
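
A minimal usage sketch for this module (the input string is a hypothetical example; the first call downloads the boltuix/bert-emotion model):

```python
from emotion import get_emotion_and_tone

# Hypothetical input text for illustration.
emotion, tone = get_emotion_and_tone("I lost my keys again and I'm really upset about it.")
print(emotion)  # e.g. "Anger" or "Sadness", depending on the classifier
print(tone)     # the matching tone instruction from emotion_tone_map
```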
gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
llm.py ADDED
@@ -0,0 +1,32 @@
+ import os
+ from groq import Groq
+
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+ if not GROQ_API_KEY:
+     raise RuntimeError("GROQ_API_KEY environment variable not set. Please set it to your Groq API key.")
+
+ groq_client = Groq(api_key=GROQ_API_KEY)
+
+ def get_llm_response(user_text, context, emotion, tone_instruction):
+     prompt = f"""
+     You are a helpful and emotionally aware assistant.
+     The user's emotional state is: {emotion}.
+
+     {tone_instruction}
+
+     Using the following context, answer the user's question:
+     ---
+     {context}
+     ---
+     Question: {user_text}
+     """
+     completion = groq_client.chat.completions.create(
+         model="meta-llama/llama-4-scout-17b-16e-instruct",
+         messages=[{"role": "user", "content": prompt}],
+         temperature=1,
+         max_completion_tokens=1024,
+         top_p=1,
+         stream=False,
+         stop=None,
+     )
+     return completion.choices[0].message.content
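
For reference, a call to this helper might look like the sketch below (all argument values are illustrative placeholders; GROQ_API_KEY must be set before the module is imported):

```python
from llm import get_llm_response

# Illustrative inputs only; in the app these come from Whisper, rag.py, and emotion.py.
answer = get_llm_response(
    user_text="What is the warranty period?",
    context="The product is covered by a two-year limited warranty.",
    emotion="Neutral",
    tone_instruction="Be informative and straightforward.",
)
print(answer)
```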
main.py ADDED
@@ -0,0 +1,7 @@
+ from ui import demo
+
+ def main():
+     demo.launch()
+
+ if __name__ == "__main__":
+     main()
rag.py ADDED
@@ -0,0 +1,55 @@
+ import os
+ import PyPDF2
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import numpy as np
+ from rapidfuzz import fuzz
+
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
+ faiss_index = None
+ pdf_chunks = []
+ chunk_texts = []
+
+ def process_pdfs(pdf_files):
+     global faiss_index, pdf_chunks, chunk_texts
+     all_text = ""
+     chunk_texts = []
+     for pdf_file in pdf_files:
+         # Gradio may pass plain file paths (str) or file objects exposing a .name path.
+         path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
+         reader = PyPDF2.PdfReader(path)
+         for page in reader.pages:
+             # extract_text() can return None for pages without extractable text.
+             all_text += (page.extract_text() or "") + "\n"
+     chunk_size = 500
+     pdf_chunks = [all_text[i:i+chunk_size] for i in range(0, len(all_text), chunk_size)]
+     chunk_texts = pdf_chunks
+     embeddings = embedder.encode(pdf_chunks, convert_to_numpy=True)
+     dim = embeddings.shape[1]
+     faiss_index = faiss.IndexFlatL2(dim)
+     faiss_index.add(embeddings)
+     return f"Processed {len(pdf_chunks)} chunks from {len(pdf_files)} PDF(s)."
+
+ def semantic_search(query, top_k=3):
+     global faiss_index, chunk_texts
+     if faiss_index is None or not chunk_texts:
+         return []
+     query_emb = embedder.encode([query], convert_to_numpy=True)
+     D, I = faiss_index.search(query_emb, top_k)
+     return [chunk_texts[i] for i in I[0] if i < len(chunk_texts)]
+
+ def keyword_search(query, top_k=3):
+     global chunk_texts
+     if not chunk_texts:
+         return []
+     scored = [(chunk, fuzz.partial_ratio(query.lower(), chunk.lower())) for chunk in chunk_texts]
+     scored = sorted(scored, key=lambda x: x[1], reverse=True)
+     return [chunk for chunk, score in scored[:top_k]]
+
+ def retrieve_context(query, top_k=3):
+     semantic_results = semantic_search(query, top_k)
+     keyword_results = keyword_search(query, top_k)
+     combined = []
+     seen = set()
+     for chunk in semantic_results + keyword_results:
+         if chunk not in seen:
+             combined.append(chunk)
+             seen.add(chunk)
+     return "\n".join(combined)
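
A small end-to-end sketch of the retrieval API (the PDF paths are hypothetical; this assumes plain file paths are accepted, per the path handling in `process_pdfs` above):

```python
from rag import process_pdfs, retrieve_context

# Hypothetical local PDF paths for illustration.
print(process_pdfs(["docs/manual.pdf", "docs/faq.pdf"]))
print(retrieve_context("How do I reset the device?", top_k=3))
```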
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio
+ groq
+ transformers
+ PyPDF2
+ sentence-transformers
+ faiss-cpu
+ soundfile
+ numpy
+ google-genai
+ rapidfuzz
+ openai-whisper
tts_gemini.py ADDED
@@ -0,0 +1,41 @@
+ import os
+ import wave
+ import uuid
+ from google import genai
+ from google.genai import types
+
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+
+ def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
+     with wave.open(filename, "wb") as wf:
+         wf.setnchannels(channels)
+         wf.setsampwidth(sample_width)
+         wf.setframerate(rate)
+         wf.writeframes(pcm)
+
+ def tts_gemini(text, api_key=GEMINI_API_KEY):
+     output_dir = os.path.join(os.getcwd(), "tts_outputs")
+     os.makedirs(output_dir, exist_ok=True)
+     file_name = os.path.join(output_dir, f"tts_{uuid.uuid4().hex}.wav")
+     try:
+         client = genai.Client(api_key=api_key)
+         response = client.models.generate_content(
+             model="gemini-2.5-flash-preview-tts",
+             contents=text,
+             config=types.GenerateContentConfig(
+                 response_modalities=["AUDIO"],
+                 speech_config=types.SpeechConfig(
+                     voice_config=types.VoiceConfig(
+                         prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                             voice_name='Kore',
+                         )
+                     )
+                 ),
+             ),
+         )
+         data = response.candidates[0].content.parts[0].inline_data.data
+         wave_file(file_name, data)
+         return file_name
+     except Exception as e:
+         print(f"[Gemini TTS ERROR] {e}")
+         return None
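
Example use of the TTS helper (requires GEMINI_API_KEY in the environment; the input text is illustrative, and the function returns None and prints an error on failure):

```python
from tts_gemini import tts_gemini

# Illustrative text; the output .wav lands in ./tts_outputs.
wav_path = tts_gemini("Your documents have been processed. Ask me anything.")
print(wav_path or "TTS failed; check GEMINI_API_KEY and the error printed above.")
```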
ui.py ADDED
@@ -0,0 +1,83 @@
+ import gradio as gr
+ from rag import process_pdfs, retrieve_context
+ from emotion import get_emotion_and_tone
+ from llm import get_llm_response
+ from tts_gemini import tts_gemini
+ import whisper
+ import time
+ import csv
+ import os
+
+ # Load Whisper model once
+ whisper_model = whisper.load_model("base")
+
+ LOG_DIR = "logs"
+ LOG_FILE = os.path.join(LOG_DIR, "latency_log.csv")
+
+ # Ensure log directory and CSV header
+ os.makedirs(LOG_DIR, exist_ok=True)
+ if not os.path.exists(LOG_FILE):
+     with open(LOG_FILE, mode="w", newline="") as f:
+         writer = csv.writer(f)
+         writer.writerow([
+             "Whisper STT (s)", "Document Retrieval (s)", "Sentiment Analysis (s)", "Response Gen (LLM) (s)", "TTS Synthesis (s)", "Total (s)"
+         ])
+
+ def process_audio_with_rag(audio):
+     t0 = time.time()
+     stt_start = time.time()
+     result = whisper_model.transcribe(audio)
+     text = result["text"]
+     stt_end = time.time()
+     retrieval_start = time.time()
+     context = retrieve_context(text)
+     retrieval_end = time.time()
+     sentiment_start = time.time()
+     emotion, tone_instruction = get_emotion_and_tone(text)
+     sentiment_end = time.time()
+     llm_start = time.time()
+     llm_output = get_llm_response(text, context, emotion, tone_instruction)
+     llm_end = time.time()
+     tts_start = time.time()
+     tts_path = tts_gemini(llm_output)
+     tts_end = time.time()
+     t1 = time.time()
+     stt_latency = stt_end - stt_start
+     retrieval_latency = retrieval_end - retrieval_start
+     sentiment_latency = sentiment_end - sentiment_start
+     llm_latency = llm_end - llm_start
+     tts_latency = tts_end - tts_start
+     total_latency = t1 - t0
+     # Log to CSV (latency only)
+     with open(LOG_FILE, mode="a", newline="") as f:
+         writer = csv.writer(f)
+         writer.writerow([
+             f"{stt_latency:.3f}",
+             f"{retrieval_latency:.3f}",
+             f"{sentiment_latency:.3f}",
+             f"{llm_latency:.3f}",
+             f"{tts_latency:.3f}",
+             f"{total_latency:.3f}"
+         ])
+     return llm_output, emotion, text, context, tts_path
+
+ demo = gr.Blocks()
+ with demo:
+     gr.Markdown("""
+     # Voice to LLM & Sentiment Analyzer with RAG (PDF Upload & Gemini TTS)
+     """)
+     with gr.Row():
+         pdf_input = gr.Files(label="Upload PDF(s)", type="filepath")
+         pdf_status = gr.Textbox(label="PDF Processing Status")
+     pdf_input.upload(process_pdfs, inputs=pdf_input, outputs=pdf_status)
+     audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak your query")
+     llm_output = gr.Textbox(label="LLM Output")
+     sentiment_output = gr.Textbox(label="Sentiment")
+     transcript_output = gr.Textbox(label="Transcribed Text")
+     context_output = gr.Textbox(label="Retrieved Context from PDFs")
+     tts_output = gr.Audio(label="LLM Output (Gemini TTS)")
+     audio_input.change(
+         process_audio_with_rag,
+         inputs=audio_input,
+         outputs=[llm_output, sentiment_output, transcript_output, context_output, tts_output]
+     )