Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,212 +1,71 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
from gradio_client import Client
|
3 |
-
import json
|
4 |
-
import logging
|
5 |
-
import openai
|
6 |
import os
|
|
|
|
|
|
|
7 |
import re
|
8 |
-
import html
|
9 |
-
|
10 |
-
# Logging configuration
|
11 |
-
logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
|
12 |
-
format='%(asctime)s - %(levelname)s - %(message)s')
|
13 |
-
|
14 |
-
openai.api_key = os.getenv("OPENAI_API_KEY")
|
15 |
-
|
16 |
-
def parse_api_response(response):
    """Normalize a raw API response into a dictionary.

    A string is decoded as JSON first, and a non-empty list is reduced to
    its first element. Whatever remains must be a dict.

    Args:
        response: A JSON string, a list, or a dict.

    Returns:
        The response as a dict.

    Raises:
        ValueError: If JSON decoding fails or the result is not a dict.
            The triggering exception is attached as ``__cause__``.
    """
    try:
        if isinstance(response, str):
            response = json.loads(response)
        if isinstance(response, list) and response:
            response = response[0]
        if not isinstance(response, dict):
            raise ValueError(f"예상치 못한 응답 형식입니다. 받은 데이터 타입: {type(response)}")
        return response
    except Exception as e:
        logging.error(f"API 응답 파싱 실패: {str(e)}")
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"API 응답 파싱 실패: {str(e)}") from e
|
28 |
-
|
29 |
-
def get_youtube_script(url):
    """Fetch title, description, transcript and thumbnails for a video.

    Calls the ``/predict`` endpoint of the ``whispersound/YT_Ts_R`` Gradio
    app and reads the first entry of the ``data`` field of the parsed
    response.

    Args:
        url: The YouTube URL passed to the remote endpoint as ``youtube_url``.

    Returns:
        A ``(title, description, transcription_text, thumbnails)`` tuple.

    Raises:
        ValueError: If the response has no usable ``data`` entry or the
            transcript text is empty.
        Exception: Any error raised by the client or the parser is logged
            and re-raised unchanged.
    """
    logging.info(f"스크립트 추출 시작: URL = {url}")
    client = Client("whispersound/YT_Ts_R")
    try:
        result = client.predict(youtube_url=url, api_name="/predict")
        parsed_result = parse_api_response(result)

        if 'data' not in parsed_result or not parsed_result['data']:
            raise ValueError("API 응답에 유효한 데이터가 없습니다.")

        # NOTE(review): assumes each entry of "data" is a dict — confirm
        # against the remote app's response schema.
        data = parsed_result["data"][0]
        title = data.get("title", "제목 없음")
        description = data.get("description", "설명 없음")
        transcription_text = data.get("transcriptionAsText", "")
        thumbnails = data.get("thumbnails", [])

        if not transcription_text:
            raise ValueError("추출된 스크립트가 없습니다.")

        logging.info("스크립트 추출 완료")
        return title, description, transcription_text, thumbnails
    except Exception:
        # Log with traceback, then let the caller decide how to react.
        logging.exception("스크립트 추출 중 오류 발생")
        raise
|
53 |
-
|
54 |
-
def call_api(prompt, max_tokens, temperature, top_p):
    """Send a single-turn chat request and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        top_p: Forwarded as ``top_p``.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        Exception: Any error from the API call or from reading the
            response is logged and re-raised unchanged.
    """
    try:
        # NOTE(review): openai.ChatCompletion is the pre-1.0 client
        # interface — confirm the pinned openai version supports it.
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        return response['choices'][0]['message']['content']
    except Exception:
        logging.exception("LLM API 호출 중 오류 발생")
        raise
|
67 |
-
|
68 |
-
def summarize_text(title, description, text):
|
69 |
-
prompt = f"""
|
70 |
-
[์ ํ๋ธ ์์ฝ ๊ท์น]
|
71 |
-
1. ๋๋ ์ ํ๋ธ ์์ ์ ๋ฌธ ํด์ค๊ฐ๋ก์ ์ง์นจ์ ๋ง๊ฒ ์ด ๊ธ์ ์์ฑํ๋ผ
|
72 |
-
2. ์๋์ ์ ๋ชฉ๊ณผ ์ค๋ช
์ ์ด ์ ํ๋ธ ์์์ ์๋ณธ ๋ฉํ๋ฐ์ดํฐ์ด๋ค.
|
73 |
-
3. ๋ฐ๋์ ์ ๋ชฉ๊ณผ ์ค๋ช
์ผ๋ก ์ฃผ์ ์ ๋ฌธ๋งฅ, ์ฒ ์(Spelling)์ ๋จผ์ ํ์
ํ๊ณ , ์๋์ ๋๋ณธ์ ๋ฐ๋์ ์ง์นจ์ ๋ง๊ฒ ์์ธํ๊ฒ ์์ฝํ๋ผ
|
74 |
-
- ๋ฐ๋์ ์ฃผ์ด์ง ์ ๋ชฉ, ์ค๋ช
์ ์๋ ์ฒ ์(Spelling)๋ฅผ ์์ฝ์ ๋ฐ์ํ๋ผ(์๋ฌธ ๋๋ณธ์๋ ์คํ์๊ฐ ์์ ์ ์๋ค)
|
75 |
-
4. ๋ฐ๋์ ํ๊ธ๋ก ์์ฑํ๋ผ
|
76 |
-
5. ๋ฐ๋์ '์ด ์ ํ๋ธ ๋๋ณธ์', '์ด ์์์', '์ด ์ ํ๋ธ๋'๋ฑ์ ์๊ฐ์ ํํ์ ์ ์ธํ๋ผ
|
77 |
-
6. ์์ฝ๋ฌธ๋ง์ผ๋ก๋ ์์์ ์ง์ ์์ฒญํ ๊ฒ๊ณผ ๋์ผํ ์์ค์ผ๋ก ๋ด์ฉ์ ์ดํดํ ์ ์๋๋ก ์์ธํ ์์ฑ
|
78 |
-
7. ๊ธ์ ๋๋ฌด ์์ถํ๊ฑฐ๋ ํจ์ถํ์ง ๋ง๊ณ , ์ค์ํ ๋ด์ฉ๊ณผ ์ธ๋ถ์ฌํญ์ ๋ชจ๋ ํฌํจ
|
79 |
-
8. ๋ฐ๋์ ๋๋ณธ์ ํ๋ฆ๊ณผ ๋
ผ๋ฆฌ ๊ตฌ์กฐ๋ฅผ ์ ์ง
|
80 |
-
9. ๋๋ณธ์ ๋ชฉ์ ์ด๋ ์๋๋ฅผ ํ์
ํ๊ณ , ์ด๋ฅผ ์์ฝ์ ๋ฐ๋์ ๋ฐ์
|
81 |
-
10. ๋ฐ๋์ ์๊ฐ ์์๋ ์ฌ๊ฑด์ ์ ๊ฐ ๊ณผ์ ์ ๋ช
ํํ๊ฒ ๋ฐ์
|
82 |
-
11. ๋ฑ์ฅ์ธ๋ฌผ, ์ฅ์, ์ฌ๊ฑด ๋ฑ ์ค์ํ ์์๋ฅผ ์ ํํ๊ฒ ์์ฑ
|
83 |
-
12. ๋๋ณธ์์ ์ ๋ฌํ๋ ๊ฐ์ ์ด๋ ๋ถ์๊ธฐ๋ ํฌํจ
|
84 |
-
13. ๋ฐ๋์ ๊ธฐ์ ์ ์ฉ์ด๋ ์ ๋ฌธ ์ฉ์ด๊ฐ ์์ ๊ฒฝ์ฐ, ์ด๋ฅผ ์ ํํ๊ฒ ์ฌ์ฉ
|
85 |
-
14. ๋ฐ๋์ ํต์ฌ ์น์
(์์ฃผ์ )๋ฅผ ํ์
ํ์ฌ ์น์
์ ๋ง๊ฒ ๊ธ์ ์์ฝํ๋ผ(๊ธ์ ์์ ๊ณ ๋ คํ์ฌ ์น์
์ ๊ฐ์๋ฅผ ํ๋ ฅ์ ์ผ๋ก ์ค์ )
|
86 |
-
15. ๊ฐ ์น์
์ ์ ๋ชฉ(์์ฃผ์ )์๋ ๋ด์ฉ๊ณผ ์ด์ธ๋ฆฌ๋ ์ ์ ํ ์ด๋ชจ์ง๋ก ์์ฃผ์ ๋ฅผ ์์ํ๋ผ
|
87 |
-
16. ๊ฐ ์น์
์ ๋ด์ฉ์ Bullet Point๋ฅผ ์ฌ์ฉํ์ฌ ๊ฐ๋
์ฑ์ ๋์ฌ๋ผ(๋ฌธ์ฅ ๋จ์๋ก ๊ตฌ๋ถ)
|
88 |
-
[์์]
|
89 |
-
(๋ณ๊ฒฝ์ )
|
90 |
-
- ์ ํ๋ธ๋ฅผ ์ฒ์ ์์ํ๋ ์ฌ๋๋ค์ ๊ตฌ๋
์ ์์ ์กฐํ์์ ํฐ ๊ด์ฌ์ ๋๊ณ ๋งค์ผ ์ ํ๋ธ ์คํ๋์ค๋ฅผ ํ์ธํ๊ฒ ๋๋ค. ๊ทธ๋ฌ๋ ๊ตฌ๋
์๊ฐ 100๋ช
, 1,000๋ช
์ ๋๋ฌํ๋ ๊ฒ๋ง์ผ๋ก๋ ์ง์์ ์ธ ์ฑ์ฅ์ ๋์์ด ๋์ง ์๋๋ค. ๊ตฌ๋
์ ์๊ฐ ๋์ด๋ ํ์๋ ์ ํ๋ธ ์ฑ๋ ์ด์์ ๋ํ ๊ฐ์ ์ก์ง ๋ชปํด ํฌ๊ธฐํ๋ ๊ฒฝ์ฐ๊ฐ ๋ง๋ค.
|
91 |
-
(๋ณ๊ฒฝํ)
|
92 |
-
- ์ ํ๋ธ๋ฅผ ์ฒ์ ์์ํ๋ ์ฌ๋๋ค์ ๊ตฌ๋
์ ์์ ์กฐํ์์ ํฐ ๊ด์ฌ์ ๋๊ณ ๋งค์ผ ์ ํ๋ธ ์คํ๋์ค๋ฅผ ํ์ธํ๊ฒ ๋๋ค.
|
93 |
-
- ๊ทธ๋ฌ๋ ๊ตฌ๋
์๊ฐ 100๋ช
, 1,000๋ช
์ ๋๋ฌํ๋ ๊ฒ๋ง์ผ๋ก๋ ์ง์์ ์ธ ์ฑ์ฅ์ ๋์์ด ๋์ง ์๋๋ค.
|
94 |
-
- ๊ตฌ๋
์ ์๊ฐ ๋์ด๋ ๏ฟฝ๏ฟฝ์๋ ์ ํ๋ธ ์ฑ๋ ์ด์์ ๋ํ ๊ฐ์ ์ก์ง ๋ชปํด ํฌ๊ธฐํ๋ ๊ฒฝ์ฐ๊ฐ ๋ง๋ค.
|
95 |
-
17. ๊ฐ ์น์
์ ๋ด์ฉ์ ๋ฐ๋์ ์ถฉ์คํ๊ฒ ์์ฑ
|
96 |
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
current_sentence = sentence.strip()
|
117 |
-
else:
|
118 |
-
current_sentence += sentence
|
119 |
-
if sentence.endswith(('.', '?', '!')):
|
120 |
-
combined_sentences.append(current_sentence.strip())
|
121 |
-
current_sentence = ""
|
122 |
-
if current_sentence:
|
123 |
-
combined_sentences.append(current_sentence.strip())
|
124 |
-
return combined_sentences
|
125 |
-
|
126 |
-
def display_script(title, script):
    """Render the transcript as a collapsible HTML block.

    The script is split into sentences with ``split_sentences`` and the
    sentences are joined with blank lines. Title and transcript are
    HTML-escaped because both come from an external source and are
    embedded directly into markup.

    Args:
        title: Video title shown above the transcript.
        script: Raw transcript text.

    Returns:
        An HTML string containing a ``<details>`` element.
    """
    script_sentences = split_sentences(script)
    formatted_script = "\n\n".join(script_sentences)
    safe_title = html.escape(str(title))
    safe_script = html.escape(formatted_script)
    return f"""<div class="script-box">
    <details>
        <summary>클릭하여 펼치기</summary>
        <div class="output-title">{safe_title}</div>
        <p style="white-space: pre-wrap;">{safe_script}</p>
    </details>
</div>"""
|
136 |
-
|
137 |
-
def display_summary(title, summary):
    """Wrap a summary in the shared ``script-box`` HTML container.

    The title is HTML-escaped before being embedded. The summary is
    inserted verbatim, as in the original code.

    Args:
        title: Heading text shown above the summary.
        summary: Summary markup or text, embedded unchanged.

    Returns:
        An HTML string.
    """
    # NOTE(review): summary is embedded unescaped on purpose here; confirm
    # the caller only passes trusted or pre-sanitized markup.
    safe_title = html.escape(str(title))
    return f"""<div class="script-box">
    <div class="output-title">{safe_title}</div>
    {summary}
</div>"""
|
142 |
-
|
143 |
-
def get_thumbnail_url(thumbnails):
    """Return the URL of the 640x480 thumbnail, if one is listed.

    Args:
        thumbnails: An iterable of dicts with ``width``, ``height`` and
            ``url`` keys. ``None`` is treated as an empty list, and
            entries that are not dicts are skipped.

    Returns:
        The ``url`` value of the first 640x480 entry, or a message string
        saying no such thumbnail was found.
    """
    for thumbnail in thumbnails or []:
        if not isinstance(thumbnail, dict):
            continue
        if thumbnail.get("width") == 640 and thumbnail.get("height") == 480:
            return thumbnail.get("url")
    return "640x480 크기의 썸네일을 찾을 수 없습니다."
|
148 |
-
|
149 |
-
def analyze(url):
|
150 |
-
# ์คํฌ๋ฆฝํธ ์ถ์ถ
|
151 |
-
yield "์คํฌ๋ฆฝํธ ์ถ์ถ ์ค...", "์คํฌ๋ฆฝํธ ์ถ์ถ ์ค...", ""
|
152 |
-
title, description, script, thumbnails = get_youtube_script(url)
|
153 |
-
script_content = display_script(title, script)
|
154 |
-
thumbnail_url = get_thumbnail_url(thumbnails)
|
155 |
-
|
156 |
-
# ์๋ฌธ ์คํฌ๋ฆฝํธ ํ์ ๋ฐ ์์ฝ ์์
|
157 |
-
yield script_content, "์์ฝ ์์ฑ ์ค...", thumbnail_url
|
158 |
|
159 |
-
|
160 |
-
summary = summarize_text(title, description, script)
|
161 |
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
{formatted_summary}
|
191 |
-
</div>"""
|
192 |
|
193 |
-
#
|
194 |
-
|
195 |
-
|
196 |
-
# Gradio interface
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
inputs=[youtube_url_input],
|
208 |
-
outputs=[script_output, summary_output, thumbnail_output] # thumbnail_output ์ถ๊ฐ
|
209 |
-
)
|
210 |
-
|
211 |
-
if __name__ == "__main__":
|
212 |
-
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
import requests
|
3 |
+
import json
|
4 |
+
import gradio as gr
|
5 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
# Read the RapidAPI key from the Hugging Face environment variables and set the API host
|
8 |
+
AA_KEY = os.getenv("AA_KEY")
|
9 |
+
AA_HOST = "youtube-transcriptor.p.rapidapi.com"
|
10 |
+
|
11 |
+
# Extract the video ID from a YouTube URL
|
12 |
+
# Patterns are tried in order; each captures the video ID in group 1.
_VIDEO_ID_PATTERNS = (
    # Standard watch URLs: the "v" query parameter. Anchoring on "?" / "&"
    # stops parameters that merely end in "v" (e.g. "sv=") from matching.
    re.compile(r"(?:^|[?&])v=([^#&?/]+)"),
    # Short links: youtu.be/<id>
    re.compile(r"youtu\.be/([^#&?/]+)"),
    # Path-style links: /shorts/<id>, /embed/<id>, /live/<id>, /v/<id>
    re.compile(r"youtube(?:-nocookie)?\.com/(?:shorts|embed|live|v)/([^#&?/]+)"),
)


def get_video_id(youtube_url):
    """Extract the video ID from a YouTube URL.

    Supports ``watch?v=<id>`` URLs, ``youtu.be/<id>`` short links, and the
    ``/shorts/``, ``/embed/``, ``/live/`` and ``/v/`` path forms.

    Args:
        youtube_url: The URL to inspect.

    Returns:
        The video ID as a string, or ``None`` when the input is not a
        string or no non-empty ID can be found.
    """
    if not isinstance(youtube_url, str):
        return None

    candidate = youtube_url.strip()
    for pattern in _VIDEO_ID_PATTERNS:
        match = pattern.search(candidate)
        if match:
            return match.group(1)
    return None
|
16 |
+
|
17 |
+
# Subtitle language priority list
|
18 |
+
LANGUAGE_PRIORITY = ['ko', 'en', 'ja', 'zh']
|
19 |
+
|
20 |
+
# Request the YouTube subtitles, trying each language in priority order
|
21 |
+
def get_youtube_transcript(youtube_url):
|
22 |
+
# ๋น๋์ค ID ์ถ์ถ
|
23 |
+
video_id = get_video_id(youtube_url)
|
24 |
+
if video_id is None:
|
25 |
+
return {"error": "์๋ชป๋ ์ ํ๋ธ URL์
๋๋ค. ๋น๋์ค ID๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
+
url = "https://youtube-transcriptor.p.rapidapi.com/transcript"
|
|
|
28 |
|
29 |
+
headers = {
|
30 |
+
"x-rapidapi-key": AA_KEY,
|
31 |
+
"x-rapidapi-host": AA_HOST
|
32 |
+
}
|
33 |
+
|
34 |
+
# ์ธ์ด ์ฐ์ ์์์ ๋ฐ๋ผ ์๏ฟฝ๏ฟฝ์ ์ผ๋ก ์์ฒญ์ ์๋
|
35 |
+
for lang in LANGUAGE_PRIORITY:
|
36 |
+
querystring = {"video_id": video_id, "lang": lang}
|
37 |
+
response = requests.get(url, headers=headers, params=querystring)
|
38 |
+
|
39 |
+
# ์ํ ์ฝ๋ ํ์ธ ๋ฐ ์ ์ฒด ์๋ต ๋ฐํ
|
40 |
+
if response.status_code == 200:
|
41 |
+
try:
|
42 |
+
data = response.json()
|
43 |
+
|
44 |
+
# ์ ์ฒด ์๋ต ๋ฐ์ดํฐ๋ฅผ ๊ทธ๋๋ก ๋ฐํ
|
45 |
+
return {"language": lang, "data": data}
|
46 |
+
|
47 |
+
except json.JSONDecodeError as e:
|
48 |
+
return {"error": f"JSON ๋์ฝ๋ฉ ์ค๋ฅ ๋ฐ์: {str(e)}"}
|
49 |
+
|
50 |
+
# ๋ชจ๋ ์ธ์ด์์ ์๋ง์ ์ฐพ์ง ๋ชปํ ๊ฒฝ์ฐ
|
51 |
+
return {"error": "์ฐ์ ์์ ์ธ์ด๋ก ์๋ง์ ์ฐพ์ ์ ์์ต๋๋ค."}
|
52 |
+
|
53 |
+
# Gradio callback definition
|
54 |
+
def youtube_transcript_interface(youtube_url):
    """Gradio callback: return the transcript lookup result as JSON text.

    Args:
        youtube_url: URL typed into the text input.

    Returns:
        The dict returned by ``get_youtube_transcript``, serialized with
        two-space indentation and non-ASCII characters left unescaped.
    """
    return json.dumps(
        get_youtube_transcript(youtube_url),
        ensure_ascii=False,
        indent=2,
    )
|
60 |
+
|
61 |
+
# Build the Gradio interface
|
62 |
+
interface = gr.Interface(
|
63 |
+
fn=youtube_transcript_interface,
|
64 |
+
inputs="text",
|
65 |
+
outputs="text",
|
66 |
+
title="YouTube ์๋ง ์ถ์ถ๊ธฐ",
|
67 |
+
description="์ ํ๋ธ URL์ ์
๋ ฅํ์ธ์."
|
68 |
+
)
|
69 |
+
|
70 |
+
# Gradio ์ธํฐํ์ด์ค ์คํ
|
71 |
+
interface.launch()
|
|
|
|
|
|
|
|
|
|
|
|