Update app.py
app.py CHANGED
@@ -1,28 +1,48 @@
-import
-import
+import requests
+import json
 from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords
-
-import
-
+import nltk
+from flask import Flask, request, jsonify
+
+# Download NLTK data
+nltk.download("punkt")
+nltk.download("stopwords")
+
+# Initialize stop words
+stop_words = set(stopwords.words("english"))
 
-#
-
-
-
-
-
-
+# Initialize Flask app
+app = Flask(__name__)
+
+# Groq API credentials and endpoints
+GROQ_API_KEY = "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"
+TRANSCRIBE_ENDPOINT = "https://api.groq.com/transcribe"  # Replace with actual endpoint
+KEYWORD_EXTRACTION_ENDPOINT = "https://api.groq.com/keywords"  # Replace with actual endpoint
 
-# Transcription function (mocked for this example)
 def transcribe_audio(file_path):
-
-
-
+    """Send audio file to Groq's transcription API."""
+    with open(file_path, "rb") as audio_file:
+        response = requests.post(
+            TRANSCRIBE_ENDPOINT,
+            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
+            files={"file": audio_file}
+        )
+    response.raise_for_status()
+    return response.json()["transcript"]
+
+def extract_keywords(text):
+    """Send text to Groq's keyword extraction API."""
+    response = requests.post(
+        KEYWORD_EXTRACTION_ENDPOINT,
+        headers={"Authorization": f"Bearer {GROQ_API_KEY}", "Content-Type": "application/json"},
+        json={"text": text}
+    )
+    response.raise_for_status()
+    return response.json()["keywords"]
 
-# Function to extract key sentences
 def extract_key_sentences(transcript):
-
+    """Extract sentences containing keywords from the transcript."""
     sentences = sent_tokenize(transcript)
     important_sentences = [
         sentence for sentence in sentences
@@ -30,43 +50,39 @@ def extract_key_sentences(transcript):
     ]
     return important_sentences
 
-
+@app.route("/transcribe", methods=["POST"])
+def transcribe():
+    """API endpoint to transcribe audio and generate notes."""
+    if "file" not in request.files:
+        return jsonify({"error": "No file uploaded"}), 400
+    file = request.files["file"]
+    file_path = "/tmp/audio_file.wav"
+    file.save(file_path)
+
+    # Transcribe the audio
+    transcript = transcribe_audio(file_path)
+
+    # Generate notes
+    notes = generate_notes(transcript)
+
+    return jsonify(notes)
+
 def generate_notes(transcript):
+    """Generate summarized notes based on keywords and important sentences."""
+    # Extract key sentences
     key_sentences = extract_key_sentences(transcript)
 
-    #
-
-    tfidf_matrix = vectorizer.fit_transform(key_sentences)
-    scores = tfidf_matrix.sum(axis=1).A1
-    scored_sentences = sorted(zip(scores, key_sentences), reverse=True)
-
-    # Generating notes as a mix of important sentences
-    long_questions = scored_sentences[:3]  # Take top 3 for long questions
-    short_questions = scored_sentences[3:6]  # Next 3 for short questions
-    mcqs = scored_sentences[6:9]  # Following 3 for MCQs
+    # Extract keywords
+    keywords = extract_keywords(transcript)
 
+    # Prepare notes
     notes = {
-        "
-        "
-        "
+        "short_questions": keywords[:5],  # Select top 5 keywords as short questions
+        "long_questions": key_sentences[:3],  # Select first 3 key sentences for long questions
+        "mcq": [{"question": f"What is {kw}?", "answer": "Yes/No"} for kw in keywords[:3]]
     }
 
     return notes
 
-
-
-    transcript = transcribe_audio(file.name)
-    notes = generate_notes(transcript)
-    return notes
-
-# Gradio UI setup
-iface = gr.Interface(
-    fn=transcribe,
-    inputs="file",
-    outputs="json",
-    title="Audio to Study Notes",
-    description="Transcribe audio to extract key sentences for study notes, including Long Questions, Short Questions, and MCQs."
-)
-
-# Run the app
-iface.launch()
+if __name__ == "__main__":
+    app.run(debug=True)
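
The two Groq endpoints above are placeholders (their inline comments say "Replace with actual endpoint"). Below is a minimal sketch of how transcribe_audio and extract_keywords could instead be pointed at Groq's OpenAI-compatible API; the /openai/v1/audio/transcriptions and /openai/v1/chat/completions paths, the whisper-large-v3 and llama-3.1-8b-instant model names, and the response fields read here are assumptions about that API, not part of this commit.

import requests

GROQ_API_KEY = "YOUR_GROQ_API_KEY"  # assumption: supply the key via config instead of hard-coding it
GROQ_BASE = "https://api.groq.com/openai/v1"  # assumption: Groq's OpenAI-compatible base URL

def transcribe_audio(file_path):
    """Sketch: send audio to an assumed Groq speech-to-text endpoint."""
    with open(file_path, "rb") as audio_file:
        response = requests.post(
            f"{GROQ_BASE}/audio/transcriptions",
            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
            files={"file": audio_file},
            data={"model": "whisper-large-v3"},  # assumption: available transcription model
        )
    response.raise_for_status()
    return response.json()["text"]  # assumption: OpenAI-style responses return the transcript as "text"

def extract_keywords(text):
    """Sketch: no dedicated keyword endpoint is assumed, so ask a chat model for keywords."""
    response = requests.post(
        f"{GROQ_BASE}/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}", "Content-Type": "application/json"},
        json={
            "model": "llama-3.1-8b-instant",  # assumption: any Groq-hosted chat model would do
            "messages": [{
                "role": "user",
                "content": f"List the 10 most important keywords in this text, comma-separated:\n{text}",
            }],
        },
    )
    response.raise_for_status()
    reply = response.json()["choices"][0]["message"]["content"]
    return [kw.strip() for kw in reply.split(",") if kw.strip()]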
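
With the Flask app running (python app.py serves on Flask's default http://127.0.0.1:5000 in debug mode), the /transcribe route can be exercised with a small client such as the sketch below; lecture.wav is a hypothetical input file, and the response keys mirror the notes dict built in generate_notes.

import requests

# Sketch of a client call against the /transcribe route defined in app.py.
with open("lecture.wav", "rb") as f:  # hypothetical audio file
    resp = requests.post("http://127.0.0.1:5000/transcribe", files={"file": f})

resp.raise_for_status()
notes = resp.json()
print(notes["short_questions"])  # top 5 keywords
print(notes["long_questions"])   # first 3 key sentences
print(notes["mcq"])              # 3 auto-generated "What is ...?" items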