lodhrangpt committed on
Commit
1507087
·
verified ·
1 Parent(s): 09023d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -49
app.py CHANGED
@@ -1,28 +1,48 @@
1
- import gradio as gr
2
- import nltk
3
  from nltk.tokenize import sent_tokenize, word_tokenize
4
  from nltk.corpus import stopwords
5
- from sklearn.feature_extraction.text import TfidfVectorizer
6
- import openai
7
- import datetime
 
 
 
 
 
 
8
 
9
- # Ensure necessary NLTK resources are downloaded
10
- try:
11
- nltk.data.find('tokenizers/punkt')
12
- nltk.data.find('corpora/stopwords')
13
- except LookupError:
14
- nltk.download('punkt')
15
- nltk.download('stopwords')
16
 
17
- # Transcription function (mocked for this example)
18
  def transcribe_audio(file_path):
19
- # Assume some transcription service is being used, and return text as output
20
- transcript = "This is a sample transcription of an audio file. It contains information that can be converted into important points for study notes."
21
- return transcript
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- # Function to extract key sentences
24
  def extract_key_sentences(transcript):
25
- stop_words = set(stopwords.words("english"))
26
  sentences = sent_tokenize(transcript)
27
  important_sentences = [
28
  sentence for sentence in sentences
@@ -30,43 +50,39 @@ def extract_key_sentences(transcript):
30
  ]
31
  return important_sentences
32
 
33
- # Function to generate study notes from the transcription
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def generate_notes(transcript):
 
 
35
  key_sentences = extract_key_sentences(transcript)
36
 
37
- # Using TfidfVectorizer for scoring and ranking sentences
38
- vectorizer = TfidfVectorizer(stop_words='english')
39
- tfidf_matrix = vectorizer.fit_transform(key_sentences)
40
- scores = tfidf_matrix.sum(axis=1).A1
41
- scored_sentences = sorted(zip(scores, key_sentences), reverse=True)
42
-
43
- # Generating notes as a mix of important sentences
44
- long_questions = scored_sentences[:3] # Take top 3 for long questions
45
- short_questions = scored_sentences[3:6] # Next 3 for short questions
46
- mcqs = scored_sentences[6:9] # Following 3 for MCQs
47
 
 
48
  notes = {
49
- "Long Questions": [sentence for _, sentence in long_questions],
50
- "Short Questions": [sentence for _, sentence in short_questions],
51
- "MCQs": [sentence for _, sentence in mcqs],
52
  }
53
 
54
  return notes
55
 
56
- # Main function for Gradio app
57
- def transcribe(file):
58
- transcript = transcribe_audio(file.name)
59
- notes = generate_notes(transcript)
60
- return notes
61
-
62
- # Gradio UI setup
63
- iface = gr.Interface(
64
- fn=transcribe,
65
- inputs="file",
66
- outputs="json",
67
- title="Audio to Study Notes",
68
- description="Transcribe audio to extract key sentences for study notes, including Long Questions, Short Questions, and MCQs."
69
- )
70
-
71
- # Run the app
72
- iface.launch()
 
1
+ import requests
2
+ import json
3
  from nltk.tokenize import sent_tokenize, word_tokenize
4
  from nltk.corpus import stopwords
5
+ import nltk
6
+ from flask import Flask, request, jsonify
7
+
8
# Download NLTK data needed for tokenization and stop-word filtering
nltk.download("punkt")
nltk.download("stopwords")

# Initialize stop words
stop_words = set(stopwords.words("english"))

# Initialize Flask app
app = Flask(__name__)

# Groq API credentials and endpoints.
# SECURITY: never hard-code an API key in source — the original commit shipped
# what looks like a live key ("gsk_..."); that key must be treated as leaked
# and rotated. Read the key from the environment instead.
import os

GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
TRANSCRIBE_ENDPOINT = "https://api.groq.com/transcribe"  # Replace with actual endpoint
KEYWORD_EXTRACTION_ENDPOINT = "https://api.groq.com/keywords"  # Replace with actual endpoint
22
 
 
23
def transcribe_audio(file_path):
    """Send an audio file to Groq's transcription API and return the transcript.

    Args:
        file_path: Path to a local audio file, opened in binary mode.

    Returns:
        The "transcript" string from the API's JSON response.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
        requests.Timeout: If the request exceeds the timeout below.
        KeyError: If the response JSON lacks a "transcript" field.
    """
    with open(file_path, "rb") as audio_file:
        response = requests.post(
            TRANSCRIBE_ENDPOINT,
            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
            files={"file": audio_file},
            # Without a timeout, a stalled connection would hang the request
            # (and the serving worker) forever.
            timeout=60,
        )
    response.raise_for_status()
    return response.json()["transcript"]
33
+
34
def extract_keywords(text):
    """Send text to Groq's keyword extraction API and return its keywords.

    Args:
        text: The transcript text to extract keywords from.

    Returns:
        The "keywords" list from the API's JSON response.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
        requests.Timeout: If the request exceeds the timeout below.
        KeyError: If the response JSON lacks a "keywords" field.
    """
    response = requests.post(
        KEYWORD_EXTRACTION_ENDPOINT,
        headers={
            "Authorization": f"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json",
        },
        json={"text": text},
        # Without a timeout, a stalled connection would hang the request forever.
        timeout=60,
    )
    response.raise_for_status()
    return response.json()["keywords"]
43
 
 
44
  def extract_key_sentences(transcript):
45
+ """Extract sentences containing keywords from the transcript."""
46
  sentences = sent_tokenize(transcript)
47
  important_sentences = [
48
  sentence for sentence in sentences
 
50
  ]
51
  return important_sentences
52
 
53
@app.route("/transcribe", methods=["POST"])
def transcribe():
    """API endpoint to transcribe an uploaded audio file and generate notes.

    Expects a multipart/form-data POST with the audio under the "file" key.
    Returns the generated notes as JSON, or a 400 error when no file is sent.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]

    # Use a unique temporary file rather than the original fixed
    # "/tmp/audio_file.wav": a fixed, predictable path is both a concurrency
    # bug (simultaneous requests overwrite each other's upload) and a
    # symlink-attack risk on shared hosts.
    import os
    import tempfile

    fd, file_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # file.save() reopens by path; we only need the name
    try:
        file.save(file_path)

        # Transcribe the audio
        transcript = transcribe_audio(file_path)

        # Generate notes
        notes = generate_notes(transcript)
    finally:
        # Always remove the temp file, even if transcription fails.
        os.remove(file_path)

    return jsonify(notes)
69
+
70
def generate_notes(transcript):
    """Build study notes from a transcript.

    Combines the transcript's important sentences with API-extracted keywords
    into three sections: short questions, long questions, and MCQs.
    """
    # Gather the two raw ingredients for the note sections.
    sentences = extract_key_sentences(transcript)
    keywords = extract_keywords(transcript)

    # Turn the leading keywords into simple yes/no MCQ entries.
    mcq_items = []
    for kw in keywords[:3]:
        mcq_items.append({"question": f"What is {kw}?", "answer": "Yes/No"})

    return {
        "short_questions": keywords[:5],   # top 5 keywords as short questions
        "long_questions": sentences[:3],   # first 3 key sentences
        "mcq": mcq_items,
    }
86
 
87
# Run the Flask development server.
# NOTE(review): debug=True enables the Werkzeug interactive debugger, which
# allows arbitrary code execution via the browser console — keep it off for
# any non-local deployment.
if __name__ == "__main__":
    app.run(debug=True)