File size: 3,180 Bytes
4300fed
610f79e
4300fed
61c12f6
a3bb4a3
 
3661e80
a3bb4a3
7ffbb2d
4300fed
 
4fbe883
2227484
a3bb4a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
532dc11
3661e80
 
 
 
 
 
a3bb4a3
610f79e
a3bb4a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1821dd9
a3bb4a3
1821dd9
a3bb4a3
610f79e
4fbe883
a3bb4a3
61c12f6
4fbe883
 
 
a3bb4a3
 
 
 
4fbe883
 
 
 
a3bb4a3
 
4fbe883
 
a3bb4a3
4300fed
a3bb4a3
 
 
4300fed
610f79e
a3bb4a3
4300fed
610f79e
70cbf96
881961f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import gradio as gr
import spaces
import os, torch, io
import json

# Download the unidic dictionary at startup — presumably required by MeloTTS's
# Japanese tokenizer (see the commented hint below); TODO confirm it is still needed
# when only the "EN" model is used.
os.system("python -m unidic download")
import httpx

# print("Make sure you've downloaded unidic (python -m unidic download) for this WebUI to work.")
from melo.api import TTS
import tempfile
import wave
from pydub import AudioSegment
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
    BitsAndBytesConfig,
)

# Load the chat model in 4-bit to fit a single GPU's memory budget.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
)

# BUG FIX: `token` was used below but never defined, which raised NameError at
# import time. Read the Hugging Face access token from the environment instead;
# None falls back to anonymous access, which suffices for this public checkpoint.
token = os.environ.get("HF_TOKEN")

model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Hermes-2-Pro-Llama-3-8B",
    quantization_config=quantization_config,
    token=token,
)
tok = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", token=token)
# Stop generation at the model EOS or the Llama-3 chat turn terminator.
terminators = [tok.eos_token_id, tok.convert_tokens_to_ids("<|eot_id|>")]


def fetch_text(url):
    """Fetch a plain-text rendition of *url* via the Jina Reader proxy.

    Prefixing the URL with https://r.jina.ai/ makes the proxy strip the page
    down to readable text, keeping HTML boilerplate out of the LLM prompt.

    Raises:
        httpx.HTTPStatusError: on a non-2xx response, instead of silently
            feeding an error page to the language model downstream.
    """
    prefix_url = "https://r.jina.ai/"
    url = prefix_url + url
    response = httpx.get(url, timeout=60.0)
    response.raise_for_status()  # fail loudly rather than podcast-ify an error page
    return response.text


@spaces.GPU
def synthesize(article_url, progress=gr.Progress()):
    """Turn the article at *article_url* into a two-speaker podcast MP3.

    Pipeline: fetch article text -> LLM rewrites it as a JSON "conversation"
    -> MeloTTS voices each turn with alternating English speakers -> segments
    are concatenated and exported to "final.mp3", whose path is returned for
    the gr.Audio output.
    """
    text = fetch_text(article_url)
    # JSON skeleton the LLM is instructed to fill in.
    template = """
        {
            "conversation": [
                {"speaker": "", "text": ""},
                {"speaker": "", "text": ""}
            ]
        }
        """

    chat = [
        {
            "role": "user",
            "content": f"{text} \n Convert the text as Elaborate Conversation between two people as Podcast.\nfollowing this template \n {template}",
        }
    ]
    # BUG FIX: `device` was referenced here before its assignment further down,
    # raising UnboundLocalError on the first call.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    model_inputs = tok([messages], return_tensors="pt").to(device)

    output_ids = model.generate(
        model_inputs.input_ids,  # BUG FIX: pass the ids tensor, not the BatchEncoding
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.9,
        eos_token_id=terminators,
    )
    # BUG FIX: generate() returns token ids for prompt + completion. The original
    # code passed that tensor straight to json.loads. Decode only the newly
    # generated tokens so the parser sees the model's JSON answer.
    generated = output_ids[0][model_inputs.input_ids.shape[-1]:]
    text = tok.decode(generated, skip_special_tokens=True)

    speed = 1.0
    models = {
        "EN": TTS(language="EN", device=device),
    }
    speakers = ["EN-Default", "EN-US"]

    combined_audio = AudioSegment.empty()
    conversation = json.loads(text)
    for i, turn in enumerate(conversation["conversation"]):
        bio = io.BytesIO()
        text = turn["text"]
        # Alternate voices every turn so the two speakers sound distinct.
        speaker = speakers[i % 2]
        speaker_id = models["EN"].hps.data.spk2id[speaker]
        models["EN"].tts_to_file(
            text, speaker_id, bio, speed=speed, pbar=progress.tqdm, format="wav"
        )
        bio.seek(0)
        audio_segment = AudioSegment.from_file(bio, format="wav")
        combined_audio += audio_segment

    final_audio_path = "final.mp3"
    combined_audio.export(final_audio_path, format="mp3")
    return final_audio_path


# Gradio UI: one URL textbox, one button, one audio player wired to synthesize().
with gr.Blocks() as demo:
    gr.Markdown("# Not Ready to USE")
    gr.Markdown("# Turn Any Article into Podcast")
    gr.Markdown("## Easily convert articles from URLs into listenable audio Podcast.")
    with gr.Group():
        text = gr.Textbox(label="Article Link")
    # BUG FIX: user-facing typo "Podcasitfy" -> "Podcastify".
    btn = gr.Button("Podcastify", variant="primary")
    aud = gr.Audio(interactive=False)
    btn.click(synthesize, inputs=[text], outputs=[aud])

demo.queue(api_open=True, default_concurrency_limit=10).launch(show_api=True)