JabriA committed on
Commit
98899e8
·
1 Parent(s): fcf4167

Add Darija transcription and topic extraction app8

Browse files
Files changed (1) hide show
  1. app.py +19 -9
app.py CHANGED
@@ -3,20 +3,24 @@ import torch
3
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, pipeline
4
  from transformers import BertTokenizer, BertForSequenceClassification
5
  import librosa
 
 
 
 
 
6
 
7
  # Load models
8
  # Transcription model for Moroccan Darija
9
  processor = Wav2Vec2Processor.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
10
  transcription_model = Wav2Vec2ForCTC.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
11
 
12
- # Summarization model
13
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
14
 
15
  # Topic Classification Model (BERT for example)
16
  topic_model = BertForSequenceClassification.from_pretrained("bert-base-uncased") # Example model
17
  topic_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
18
 
19
-
20
  # Function to resample audio to 16kHz if necessary
21
  def resample_audio(audio_path, target_sr=16000):
22
  audio_input, original_sr = librosa.load(audio_path, sr=None) # Load audio with original sampling rate
@@ -57,13 +61,19 @@ def classify_topic(transcription):
57
  else:
58
  return "Other"
59
 
60
- # Function to transcribe, summarize, and classify topic
61
  def transcribe_and_summarize(audio_file):
62
  # Transcription
63
  transcription = transcribe_audio(audio_file)
64
 
65
- # Summarization
66
- summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
 
 
 
 
 
 
67
 
68
  # Topic classification
69
  topic = classify_topic(transcription)
@@ -74,7 +84,7 @@ def transcribe_and_summarize(audio_file):
74
  inputs = gr.Audio(type="filepath", label="Upload your audio file")
75
  outputs = [
76
  gr.Textbox(label="Transcription"),
77
- gr.Textbox(label="Summary"),
78
  gr.Textbox(label="Topic")
79
  ]
80
 
@@ -82,8 +92,8 @@ app = gr.Interface(
82
  fn=transcribe_and_summarize,
83
  inputs=inputs,
84
  outputs=outputs,
85
- title="Moroccan Darija Audio Transcription, Summarization, and Topic Classification",
86
- description="Upload an audio file in Moroccan Darija to get its transcription, a summarized version of the content, and the detected topic."
87
  )
88
 
89
  # Launch the app
 
3
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, pipeline
4
  from transformers import BertTokenizer, BertForSequenceClassification
5
  import librosa
6
+ import os
7
+
8
+ # Set up proxy for internal testing
9
+ os.environ["HTTP_PROXY"] = "http://meditelproxy.meditel.int:80"
10
+ os.environ["HTTPS_PROXY"] = "http://meditelproxy.meditel.int:80"
11
 
12
  # Load models
13
  # Transcription model for Moroccan Darija
14
  processor = Wav2Vec2Processor.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
15
  transcription_model = Wav2Vec2ForCTC.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
16
 
17
+ # Summarization model (for French summaries)
18
+ summarizer = pipeline("summarization", model="facebook/mbart-large-50-many-to-many-mmt")
19
 
20
  # Topic Classification Model (BERT for example)
21
  topic_model = BertForSequenceClassification.from_pretrained("bert-base-uncased") # Example model
22
  topic_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
23
 
 
24
  # Function to resample audio to 16kHz if necessary
25
  def resample_audio(audio_path, target_sr=16000):
26
  audio_input, original_sr = librosa.load(audio_path, sr=None) # Load audio with original sampling rate
 
61
  else:
62
  return "Other"
63
 
64
+ # Function to transcribe, summarize in French, and classify topic
65
  def transcribe_and_summarize(audio_file):
66
  # Transcription
67
  transcription = transcribe_audio(audio_file)
68
 
69
+ # Summarization in French
70
+ summary = summarizer(
71
+ transcription,
72
+ max_length=50,
73
+ min_length=10,
74
+ do_sample=False,
75
+ tgt_lang="fr_XX" # Target language set to French
76
+ )[0]["summary_text"]
77
 
78
  # Topic classification
79
  topic = classify_topic(transcription)
 
84
  inputs = gr.Audio(type="filepath", label="Upload your audio file")
85
  outputs = [
86
  gr.Textbox(label="Transcription"),
87
+ gr.Textbox(label="Résumé (en Français)"),
88
  gr.Textbox(label="Topic")
89
  ]
90
 
 
92
  fn=transcribe_and_summarize,
93
  inputs=inputs,
94
  outputs=outputs,
95
+ title="Moroccan Darija Audio Transcription, Résumé, and Topic Classification",
96
+ description="Upload an audio file in Moroccan Darija to get its transcription, a summarized version in French, and the detected topic."
97
  )
98
 
99
  # Launch the app