Tai Truong
fix readme
d202ada
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import TranscriptFormat
from langflow.custom import Component
from langflow.inputs import DropdownInput, IntInput, MultilineInput
from langflow.schema import Data
from langflow.template import Output
class YouTubeTranscriptsComponent(Component):
    """A component that extracts spoken content from YouTube videos as transcripts.

    The transcript is fetched with ``YoutubeLoader`` and returned either as a
    single ``Data`` object holding the full text (``transcript_format="text"``)
    or as a list of ``Data`` objects, one per chunk
    (``transcript_format="chunks"``).
    """

    display_name: str = "YouTube Transcripts"
    description: str = "Extracts spoken content from YouTube videos as transcripts."
    icon: str = "YouTube"
    name = "YouTubeTranscripts"

    inputs = [
        MultilineInput(
            name="url",
            display_name="Video URL",
            info="Enter the YouTube video URL to get transcripts from.",
            tool_mode=True,
        ),
        DropdownInput(
            name="transcript_format",
            display_name="Transcript Format",
            options=["text", "chunks"],
            value="text",
            info="The format of the transcripts. Either 'text' for a single output "
            "or 'chunks' for timestamped chunks.",
            advanced=True,
        ),
        IntInput(
            name="chunk_size_seconds",
            display_name="Chunk Size (seconds)",
            value=60,
            advanced=True,
            info="The size of each transcript chunk in seconds. Only applicable when "
            "'Transcript Format' is set to 'chunks'.",
        ),
        DropdownInput(
            name="language",
            display_name="Language",
            # Language codes offered for the source transcript; the selected
            # one is passed to the loader as a single-element list.
            options=[
                "af",
                "ak",
                "sq",
                "am",
                "ar",
                "hy",
                "as",
                "ay",
                "az",
                "bn",
                "eu",
                "be",
                "bho",
                "bs",
                "bg",
                "my",
                "ca",
                "ceb",
                "zh",
                "zh-HK",
                "zh-CN",
                "zh-SG",
                "zh-TW",
                "zh-Hans",
                "zh-Hant",
                "hak-TW",
                "nan-TW",
                "co",
                "hr",
                "cs",
                "da",
                "dv",
                "nl",
                "en",
                "en-US",
                "eo",
                "et",
                "ee",
                "fil",
                "fi",
                "fr",
                "gl",
                "lg",
                "ka",
                "de",
                "el",
                "gn",
                "gu",
                "ht",
                "ha",
                "haw",
                "iw",
                "hi",
                "hmn",
                "hu",
                "is",
                "ig",
                "id",
                "ga",
                "it",
                "ja",
                "jv",
                "kn",
                "kk",
                "km",
                "rw",
                "ko",
                "kri",
                "ku",
                "ky",
                "lo",
                "la",
                "lv",
                "ln",
                "lt",
                "lb",
                "mk",
                "mg",
                "ms",
                "ml",
                "mt",
                "mi",
                "mr",
                "mn",
                "ne",
                "nso",
                "no",
                "ny",
                "or",
                "om",
                "ps",
                "fa",
                "pl",
                "pt",
                "pa",
                "qu",
                "ro",
                "ru",
                "sm",
                "sa",
                "gd",
                "sr",
                "sn",
                "sd",
                "si",
                "sk",
                "sl",
                "so",
                "st",
                "es",
                "su",
                "sw",
                "sv",
                "tg",
                "ta",
                "tt",
                "te",
                "th",
                "ti",
                "ts",
                "tr",
                "tk",
                "uk",
                "ur",
                "ug",
                "uz",
                "vi",
                "cy",
                "fy",
                "xh",
                "yi",
                "yo",
                "zu",
            ],
            value="en",
            info=(
                "Specify to make sure the transcripts are retrieved in your desired language. "
                "Defaults to English: 'en'"
            ),
        ),
        DropdownInput(
            name="translation",
            display_name="Translation Language",
            advanced=True,
            # The empty option means "no translation"; it is converted to
            # None before being handed to the loader.
            options=["", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "hi", "ar", "id"],
            info="Translate the transcripts to the specified language. Leave empty for no translation.",
        ),
    ]

    outputs = [
        Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
    ]

    def build_youtube_transcripts(self) -> Data | list[Data]:
        """Build transcripts from the provided YouTube URL.

        Returns:
            Data | list[Data]: In ``"text"`` mode, a single ``Data`` whose
            ``"transcripts"`` key holds the full transcript text. In
            ``"chunks"`` mode, a list of ``Data`` objects, each with
            ``"content"`` and ``"metadata"`` keys. If anything fails, a single
            ``Data`` with an ``"error"`` key describing the failure is
            returned instead of raising.
        """
        try:
            wants_text = self.transcript_format == "text"
            loader = YoutubeLoader.from_youtube_url(
                self.url,
                transcript_format=TranscriptFormat.TEXT if wants_text else TranscriptFormat.CHUNKS,
                chunk_size_seconds=self.chunk_size_seconds,
                language=[self.language],
                # An empty dropdown selection means "do not translate".
                translation=self.translation or None,
            )
            transcripts = loader.load()

            if wants_text:
                if not transcripts:
                    # Without this guard, indexing an empty result would
                    # surface as a cryptic "list index out of range" error.
                    return Data(
                        data={
                            "error": "Failed to get YouTube transcripts: "
                            "no transcript was returned for this video."
                        }
                    )
                # Extract only the page_content from the Document
                return Data(data={"transcripts": transcripts[0].page_content})

            # For chunks, extract page_content and metadata separately
            return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]
        except Exception as exc:  # noqa: BLE001
            # Deliberately broad: any failure while building the loader or
            # loading the transcript is reported as an error payload rather
            # than raised to the caller.
            return Data(data={"error": f"Failed to get YouTube transcripts: {exc!s}"})