Spaces:

ganesh3
/

rag-youtube-assistant

Running

App Files Files Community

rag-youtube-assistant / app /generate_ground_truth.py

ganesh3

third commit

25b2b2b 8 months ago

raw

history blame

3.41 kB

	import json
	import os

	import ollama
	import pandas as pd
	from tqdm import tqdm

	from transcript_extractor import get_transcript

	def _parse_json_object(raw_text):
	    """Parse model output as JSON, tolerating text around the JSON object.

	    The text is first parsed as-is. If that fails, the span from the first
	    ``{`` to the last ``}`` is parsed instead, which recovers replies that
	    wrap the JSON in a Markdown code fence or add commentary around it.

	    Raises:
	        json.JSONDecodeError: If neither attempt yields valid JSON.
	    """
	    try:
	        return json.loads(raw_text)
	    except json.JSONDecodeError:
	        start = raw_text.find("{")
	        end = raw_text.rfind("}")
	        if start == -1 or end < start:
	            # Nothing that looks like an object; surface the original error.
	            raise
	        return json.loads(raw_text[start:end + 1])


	def generate_questions(transcript):
	    """Ask the ``phi3.5`` model (via ``ollama.chat``) for questions about a transcript.

	    Args:
	        transcript: Transcript text that is substituted into the prompt.

	    Returns:
	        The decoded JSON object (a ``dict``; the prompt requests the shape
	        ``{"questions": [...]}``), or ``None`` if the request fails, the reply
	        cannot be parsed as JSON, or the reply is not a JSON object.
	    """
	    prompt_template = """
	You are an AI assistant tasked with generating questions based on a YouTube video transcript.
	Formulate at least 10 questions that a user might ask based on the provided transcript.
	Make the questions specific to the content of the transcript.
	The questions should be complete and not too short. Use as few words as possible from the transcript.
	It is important that the questions are relevant to the content of the transcript and are at least 10 in number.

	The transcript:

	{transcript}

	Provide the output in parsable JSON without using code blocks:

	{{"questions": ["question1", "question2", ..., "question10"]}}
	""".strip()

	    prompt = prompt_template.format(transcript=transcript)

	    # Best-effort boundary: any failure while talking to the model or decoding
	    # its reply is reported and turned into None so callers can skip the video.
	    try:
	        response = ollama.chat(
	            model='phi3.5',
	            messages=[{"role": "user", "content": prompt}],
	        )
	        parsed = _parse_json_object(response['message']['content'])
	    except Exception as e:
	        print(f"Error generating questions: {str(e)}")
	        return None

	    # Callers test `'questions' in result` and then index it; a bare JSON
	    # string would pass that test as a substring match and then crash.
	    if not isinstance(parsed, dict):
	        print("Error generating questions: model output is not a JSON object")
	        return None
	    return parsed

	def generate_ground_truth(db_handler, data_processor, video_id):
	    """Generate questions for one video and save them as the ground-truth CSV.

	    Fetches the transcript with ``get_transcript``, hands it to
	    ``data_processor.process_transcript``, asks ``generate_questions`` for
	    questions, and writes one ``(video_id, question)`` row per question to
	    ``data/ground-truth-retrieval.csv`` (the file is overwritten).

	    Args:
	        db_handler: Not used by this function; kept so the signature stays
	            parallel to ``generate_ground_truth_for_all_videos``.
	        data_processor: Object providing ``process_transcript(video_id, data)``.
	        video_id: Identifier passed to ``get_transcript``.

	    Returns:
	        A DataFrame with columns ``video_id`` and ``question``, or ``None``
	        when the transcript cannot be retrieved or no questions were produced.
	    """
	    transcript_data = get_transcript(video_id)
	    if not (transcript_data and 'transcript' in transcript_data):
	        print(f"Failed to retrieve transcript for video {video_id}")
	        return None

	    full_transcript = " ".join(entry['text'] for entry in transcript_data['transcript'])
	    # Process the transcript
	    data_processor.process_transcript(video_id, transcript_data)

	    questions = generate_questions(full_transcript)

	    # Only a non-empty list counts as success: an empty list would overwrite
	    # the CSV with a header-only file, and a string would be split into
	    # single characters by the row comprehension below.
	    question_list = questions.get('questions') if isinstance(questions, dict) else None
	    if not isinstance(question_list, list) or not question_list:
	        print("Failed to generate questions.")
	        return None

	    df = pd.DataFrame(
	        [(video_id, q) for q in question_list],
	        columns=['video_id', 'question'],
	    )

	    csv_path = 'data/ground-truth-retrieval.csv'
	    # to_csv does not create missing parent directories.
	    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
	    df.to_csv(csv_path, index=False)
	    print(f"Ground truth data saved to {csv_path}")
	    return df

	def generate_ground_truth_for_all_videos(db_handler, data_processor):
	    """Generate questions for every stored video and save them as one CSV.

	    For each row returned by ``db_handler.get_all_videos()`` the transcript is
	    fetched, passed to ``data_processor.process_transcript``, and sent to
	    ``generate_questions``. All ``(video_id, question)`` pairs are written to
	    ``data/ground-truth-retrieval.csv`` (the file is overwritten).

	    Args:
	        db_handler: Object providing ``get_all_videos()``.
	        data_processor: Object providing ``process_transcript(video_id, data)``.

	    Returns:
	        A DataFrame with columns ``video_id`` and ``question``, or ``None``
	        when no questions could be generated for any video.
	    """
	    videos = db_handler.get_all_videos()
	    all_questions = []

	    for video in tqdm(videos, desc="Generating ground truth"):
	        video_id = video[0]  # Assuming the video ID is the first element in the tuple
	        transcript_data = get_transcript(video_id)
	        if not (transcript_data and 'transcript' in transcript_data):
	            print(f"Failed to retrieve transcript for video {video_id}")
	            continue

	        full_transcript = " ".join(entry['text'] for entry in transcript_data['transcript'])
	        # Process the transcript
	        data_processor.process_transcript(video_id, transcript_data)

	        questions = generate_questions(full_transcript)
	        # Accept only a non-empty list; a string value would otherwise be
	        # extended character by character.
	        question_list = questions.get('questions') if isinstance(questions, dict) else None
	        if not isinstance(question_list, list) or not question_list:
	            # Report the skip so a missing video in the CSV can be traced.
	            print(f"Failed to generate questions for video {video_id}")
	            continue

	        all_questions.extend((video_id, q) for q in question_list)

	    if not all_questions:
	        print("Failed to generate questions for any video.")
	        return None

	    df = pd.DataFrame(all_questions, columns=['video_id', 'question'])
	    csv_path = 'data/ground-truth-retrieval.csv'
	    # to_csv does not create missing parent directories; without this a
	    # missing folder would discard every question generated above.
	    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
	    df.to_csv(csv_path, index=False)
	    print(f"Ground truth data for all videos saved to {csv_path}")
	    return df