JabriA committed on
Commit
deca047
·
1 Parent(s): 98899e8

Add Darija transcription and topic extraction app7

Browse files
Files changed (1) hide show
  1. app.py +9 -19
app.py CHANGED
@@ -3,24 +3,20 @@ import torch
3
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, pipeline
4
  from transformers import BertTokenizer, BertForSequenceClassification
5
  import librosa
6
- import os
7
-
8
- # Set up proxy for internal testing
9
- os.environ["HTTP_PROXY"] = "http://meditelproxy.meditel.int:80"
10
- os.environ["HTTPS_PROXY"] = "http://meditelproxy.meditel.int:80"
11
 
12
  # Load models
13
  # Transcription model for Moroccan Darija
14
  processor = Wav2Vec2Processor.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
15
  transcription_model = Wav2Vec2ForCTC.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
16
 
17
- # Summarization model (for French summaries)
18
- summarizer = pipeline("summarization", model="facebook/mbart-large-50-many-to-many-mmt")
19
 
20
  # Topic Classification Model (BERT for example)
21
  topic_model = BertForSequenceClassification.from_pretrained("bert-base-uncased") # Example model
22
  topic_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
23
 
 
24
  # Function to resample audio to 16kHz if necessary
25
  def resample_audio(audio_path, target_sr=16000):
26
  audio_input, original_sr = librosa.load(audio_path, sr=None) # Load audio with original sampling rate
@@ -61,19 +57,13 @@ def classify_topic(transcription):
61
  else:
62
  return "Other"
63
 
64
- # Function to transcribe, summarize in French, and classify topic
65
  def transcribe_and_summarize(audio_file):
66
  # Transcription
67
  transcription = transcribe_audio(audio_file)
68
 
69
- # Summarization in French
70
- summary = summarizer(
71
- transcription,
72
- max_length=50,
73
- min_length=10,
74
- do_sample=False,
75
- tgt_lang="fr_XX" # Target language set to French
76
- )[0]["summary_text"]
77
 
78
  # Topic classification
79
  topic = classify_topic(transcription)
@@ -84,7 +74,7 @@ def transcribe_and_summarize(audio_file):
84
  inputs = gr.Audio(type="filepath", label="Upload your audio file")
85
  outputs = [
86
  gr.Textbox(label="Transcription"),
87
- gr.Textbox(label="Résumé (en Français)"),
88
  gr.Textbox(label="Topic")
89
  ]
90
 
@@ -92,8 +82,8 @@ app = gr.Interface(
92
  fn=transcribe_and_summarize,
93
  inputs=inputs,
94
  outputs=outputs,
95
- title="Moroccan Darija Audio Transcription, Résumé, and Topic Classification",
96
- description="Upload an audio file in Moroccan Darija to get its transcription, a summarized version in French, and the detected topic."
97
  )
98
 
99
  # Launch the app
 
3
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, pipeline
4
  from transformers import BertTokenizer, BertForSequenceClassification
5
  import librosa
 
 
 
 
 
6
 
7
  # Load models
8
  # Transcription model for Moroccan Darija
9
  processor = Wav2Vec2Processor.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
10
  transcription_model = Wav2Vec2ForCTC.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
11
 
12
+ # Summarization model
13
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
14
 
15
  # Topic Classification Model (BERT for example)
16
  topic_model = BertForSequenceClassification.from_pretrained("bert-base-uncased") # Example model
17
  topic_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
18
 
19
+
20
  # Function to resample audio to 16kHz if necessary
21
  def resample_audio(audio_path, target_sr=16000):
22
  audio_input, original_sr = librosa.load(audio_path, sr=None) # Load audio with original sampling rate
 
57
  else:
58
  return "Other"
59
 
60
+ # Function to transcribe, summarize, and classify topic
61
  def transcribe_and_summarize(audio_file):
62
  # Transcription
63
  transcription = transcribe_audio(audio_file)
64
 
65
+ # Summarization
66
+ summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
 
 
 
 
 
 
67
 
68
  # Topic classification
69
  topic = classify_topic(transcription)
 
74
  inputs = gr.Audio(type="filepath", label="Upload your audio file")
75
  outputs = [
76
  gr.Textbox(label="Transcription"),
77
+ gr.Textbox(label="Summary"),
78
  gr.Textbox(label="Topic")
79
  ]
80
 
 
82
  fn=transcribe_and_summarize,
83
  inputs=inputs,
84
  outputs=outputs,
85
+ title="Moroccan Darija Audio Transcription, Summarization, and Topic Classification",
86
+ description="Upload an audio file in Moroccan Darija to get its transcription, a summarized version of the content, and the detected topic."
87
  )
88
 
89
  # Launch the app