emirhanbilgic committed on
Commit
23f3f75
·
verified ·
1 Parent(s): 347bb89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -109
app.py CHANGED
@@ -1,47 +1,24 @@
1
- import spaces # Import this first to prevent CUDA initialization before spaces
2
-
3
  import gradio as gr
4
  import torch
5
  from transformers import MarianTokenizer, MarianMTModel
6
  from parler_tts import ParlerTTSForConditionalGeneration
7
- from transformers import AutoTokenizer
8
- import soundfile as sf
9
- from pydub import AudioSegment
10
- import os
11
- import re
12
  from PyPDF2 import PdfReader
 
13
  import textwrap
 
14
 
15
  # Device configuration
16
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
 
18
- # Initialize models and tokenizers outside the functions
19
  tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
20
  tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
 
 
21
 
22
- # Translation function
23
- @spaces.GPU(duration=120)
24
- def translate(source_text, source_lang, target_lang, batch_size=16):
25
- model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
26
-
27
- tokenizer = MarianTokenizer.from_pretrained(model_name)
28
- model = MarianMTModel.from_pretrained(model_name).to(device)
29
-
30
- text_chunks = textwrap.wrap(source_text, 512)
31
- translated_text = ""
32
-
33
- for i in range(0, len(text_chunks), batch_size):
34
- text_batch = text_chunks[i:i+batch_size]
35
- input_ids = tokenizer(text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
36
- output_ids = model.generate(input_ids, max_new_tokens=512)
37
-
38
- for output in output_ids:
39
- output_text = tokenizer.decode(output, skip_special_tokens=True)
40
- translated_text += output_text + " "
41
-
42
- return translated_text
43
-
44
- # Function to extract text from PDF
45
  def pdf_to_text(pdf_path):
46
  with open(pdf_path, 'rb') as file:
47
  pdf_reader = PdfReader(file)
@@ -51,101 +28,64 @@ def pdf_to_text(pdf_path):
51
  text += page.extract_text()
52
  return text
53
 
54
- # Function to split text into sentences using regex
55
  def split_text_into_sentences(text):
56
  sentence_endings = re.compile(r'[.!?]')
57
  sentences = sentence_endings.split(text)
58
  return [sentence.strip() for sentence in sentences if sentence.strip()]
59
 
 
 
 
 
 
 
 
60
  # Function to generate audio for a single sentence
61
  @spaces.GPU(duration=120)
62
- def generate_single_wav_from_text(sentence, description, output_file_prefix, tts_model, tts_tokenizer):
63
- input_ids = tts_tokenizer(sentence, return_tensors="pt").input_ids.to(device)
64
- prompt_input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
65
-
66
- generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
 
 
 
 
67
  audio_arr = generation.cpu().numpy().squeeze()
68
- output_file = f"{output_file_prefix}.wav"
69
- sf.write(output_file, audio_arr, tts_model.config.sampling_rate)
70
- return output_file
71
-
72
- # Function to combine audio files
73
- def combine_wav_files(output_file, *input_files, silence_duration=500):
74
- combined = AudioSegment.empty()
75
- one_second_silence = AudioSegment.silent(duration=silence_duration)
76
-
77
- for file in input_files:
78
- audio = AudioSegment.from_wav(file)
79
- combined += audio + one_second_silence
80
-
81
- combined.export(output_file, format='wav')
82
-
83
- # Function to update target language options based on the source language
84
- def update_target_lang_options(source_lang):
85
- options = {
86
- "en": ["de", "fr", "tr"],
87
- "tr": ["en"],
88
- "de": ["en", "fr"],
89
- "fr": ["en", "de"]
90
- }
91
- return gr.update(choices=options.get(source_lang, []), value=options.get(source_lang, [])[0])
92
-
93
- # Function to process sentences for audio generation
94
- def process_sentences_for_audio(sentences, description, tts_model, tts_tokenizer):
95
- audio_files = []
96
 
97
- for i, sentence in enumerate(sentences):
98
- print(f"Generating audio for sentence {i+1}...")
99
- output_file_prefix = f"sentence_{i+1}"
100
- audio_file = generate_single_wav_from_text(sentence, description, output_file_prefix, tts_model, tts_tokenizer)
101
- audio_files.append(audio_file)
102
-
103
- yield sentence, audio_file
104
-
105
- combined_output_file = "sentences_combined.wav"
106
- combine_wav_files(combined_output_file, *audio_files)
107
-
108
- yield None, combined_output_file
109
-
110
- # Gradio interface
111
  with gr.Blocks() as demo:
112
  with gr.Row():
113
- with gr.Column(scale=1):
114
  pdf_input = gr.File(label="Upload PDF", file_types=['pdf'])
115
- translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
116
- source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
117
- target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
118
- description = gr.Textbox(label="Voice Description",
119
  value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
120
- process_btn = gr.Button("Process")
121
- with gr.Column(scale=2):
122
- gr.Markdown("### Generated Audio")
123
- output_group = gr.Group()
124
 
125
- def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
 
126
  text = pdf_to_text(pdf_input.name)
127
- if translate_checkbox:
128
- text = translate(text, source_lang, target_lang)
129
-
130
  sentences = split_text_into_sentences(text)
131
- for sentence, audio_file in process_sentences_for_audio(sentences, description, tts_model, tts_tokenizer):
132
- if sentence:
133
- with output_group:
134
- gr.Markdown(f"**Sentence**: {sentence}")
135
- gr.Audio(value=audio_file, label=sentence)
136
- else:
137
- with output_group:
138
- gr.Markdown("### Combined Audio")
139
- gr.Audio(value=audio_file, label="Combined Audio")
140
 
141
- def handle_translation_toggle(translate_checkbox):
142
- if translate_checkbox:
143
- return gr.update(visible=True), gr.update(visible=True)
144
- else:
145
- return gr.update(visible=False), gr.update(visible=False)
 
 
 
 
 
 
146
 
147
- translate_checkbox.change(fn=handle_translation_toggle, inputs=translate_checkbox, outputs=[source_lang, target_lang])
148
- source_lang.change(fn=update_target_lang_options, inputs=source_lang, outputs=target_lang)
149
- process_btn.click(handle_process, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[])
150
 
 
151
  demo.launch()
 
1
+ import spaces
 
2
  import gradio as gr
3
  import torch
4
  from transformers import MarianTokenizer, MarianMTModel
5
  from parler_tts import ParlerTTSForConditionalGeneration
6
+ from transformers import AutoTokenizer, set_seed
 
 
 
 
7
  from PyPDF2 import PdfReader
8
+ import re
9
  import textwrap
10
+ import soundfile as sf
11
 
# Device configuration: run on the GPU when one is available, otherwise on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models and tokenizers once at import time so every request reuses them.
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")

# Read the output rate from the loaded checkpoint instead of hard-coding it:
# labelling samples with a rate other than the one the model produces changes
# playback speed and pitch.
SAMPLE_RATE = tts_model.config.sampling_rate

# Fixed seed passed to set_seed() before each generation.
SEED = 42
20
 
21
+ # Helper function to extract text from a PDF
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def pdf_to_text(pdf_path):
23
  with open(pdf_path, 'rb') as file:
24
  pdf_reader = PdfReader(file)
 
28
  text += page.extract_text()
29
  return text
30
 
# Helper function to split text into sentences using regex.
# Group 1 is a run of non-terminator characters (the sentence body); group 2 is
# the run of terminators that ends it. Compiled once instead of on every call.
_SENTENCE_PATTERN = re.compile(r'([^.!?]+)([.!?]*)')


def split_text_into_sentences(text):
    """Split ``text`` into sentences, keeping each sentence's end punctuation.

    The text is cut at every run of ``.``, ``!`` or ``?``. The terminator run
    stays attached to the sentence it closes so that a question or an
    exclamation is not later turned into a plain statement.

    Args:
        text: The text to split.

    Returns:
        A list of whitespace-stripped sentences. Segments that contain only
        whitespace and/or punctuation are dropped, so the list may be empty.
    """
    sentences = []
    for body, terminator in _SENTENCE_PATTERN.findall(text):
        body = body.strip()
        if body:
            sentences.append(body + terminator)
    return sentences
36
 
# Helper function to preprocess the text (normalization, punctuation)
def preprocess(text):
    """Normalize a sentence before it is tokenized for speech generation.

    Hyphens are replaced with spaces, surrounding whitespace is removed, and a
    period is appended when the text does not already end with ``.``, ``!`` or
    ``?``.

    Args:
        text: The sentence to normalize.

    Returns:
        The normalized sentence, or an empty string when nothing but
        whitespace/hyphens was supplied.
    """
    # Strip after the replacement: a trailing hyphen becomes a trailing space,
    # which would otherwise produce "word ." below.
    text = text.replace("-", " ").strip()
    if not text:
        # Nothing to punctuate; indexing text[-1] here would raise IndexError.
        return text
    if not text.endswith((".", "!", "?")):
        text += "."
    return text
43
+
# Function to generate audio for a single sentence
@spaces.GPU(duration=120)
def generate_single_wav_from_text(sentence, description):
    """Synthesize speech for one sentence in the voice given by ``description``.

    The audio is also written to ``sentence.wav`` in the working directory;
    the file name is fixed, so each call overwrites the previous file.

    Args:
        sentence: The text to speak. It is passed through ``preprocess`` first.
        description: Free-text description of the desired voice.

    Returns:
        A ``(sample_rate, audio_arr)`` tuple, where ``audio_arr`` is the
        generated waveform as a NumPy array with size-1 axes squeezed out.
    """
    # Re-seed on every call so sampling starts from the same RNG state.
    set_seed(SEED)

    # The voice description is passed as `input_ids`, the text to be spoken
    # as `prompt_input_ids`.
    inputs = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt = tts_tokenizer(preprocess(sentence), return_tensors="pt").to(device)

    generation = tts_model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        prompt_input_ids=prompt.input_ids,
        prompt_attention_mask=prompt.attention_mask,
        do_sample=True,
        temperature=1.0,
    )
    audio_arr = generation.cpu().numpy().squeeze()

    # Use the rate the model actually generates at; a hard-coded value that
    # differs from it makes the audio play at the wrong speed and pitch.
    sample_rate = tts_model.config.sampling_rate
    output_file = "sentence.wav"
    sf.write(output_file, audio_arr, sample_rate)
    return sample_rate, audio_arr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
# Gradio Interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(label="Upload PDF", file_types=['pdf'])
            description = gr.Textbox(label="Voice Description", lines=2,
                                     value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            # Shows which sentence the audio below belongs to.
            sentence_output = gr.Markdown()
            audio_output = gr.Audio(label="Generated Audio")

    def handle_process(pdf_input, description):
        """Yield ``(sentence, sample_rate, audio_arr)`` for each sentence of the PDF.

        Args:
            pdf_input: The uploaded file, either as a path string or as an
                object exposing the path through ``.name``.
            description: Free-text description of the desired voice.

        Raises:
            gr.Error: If no PDF has been uploaded.
        """
        if pdf_input is None:
            raise gr.Error("Please upload a PDF first.")

        # Accept both a plain path string and a file wrapper with `.name`.
        pdf_path = getattr(pdf_input, "name", pdf_input)

        # Extract and process text from PDF
        text = pdf_to_text(pdf_path)
        sentences = split_text_into_sentences(text)

        for sentence in sentences:
            # Generate audio for each sentence
            sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
            yield sentence, sample_rate, audio_arr

    def run_pipeline(pdf_input, description):
        """Stream each sentence and its audio to the interface as it is produced.

        A generator handler updates its outputs only through the values it
        yields, one per component listed in ``outputs``. Creating components
        or calling ``.update()`` inside the handler does not change the page.
        """
        for sentence, sample_rate, audio_arr in handle_process(pdf_input, description):
            yield f"**Sentence**: {sentence}", (sample_rate, audio_arr)

    run_button.click(run_pipeline, inputs=[pdf_input, description],
                     outputs=[sentence_output, audio_output])

# The queue is enabled so the generator handler can stream intermediate results.
demo.queue()
demo.launch()