Spaces:
Running
Running
File size: 3,409 Bytes
dbd33b2 25b2b2b dbd33b2 25b2b2b dbd33b2 25b2b2b dbd33b2 25b2b2b dbd33b2 25b2b2b dbd33b2 25b2b2b dbd33b2 25b2b2b dbd33b2 25b2b2b dbd33b2 25b2b2b dbd33b2 25b2b2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import json
import os

import ollama
import pandas as pd
from tqdm import tqdm

from transcript_extractor import get_transcript
def generate_questions(transcript):
    """Ask the phi3.5 model for at least 10 questions about *transcript*.

    The model is instructed to reply with bare JSON of the form
    {"questions": [...]}. Returns the parsed dict on success, or None if
    the chat call or the JSON parsing fails (the error is printed).
    """
    template = """
You are an AI assistant tasked with generating questions based on a YouTube video transcript.
Formulate at least 10 questions that a user might ask based on the provided transcript.
Make the questions specific to the content of the transcript.
The questions should be complete and not too short. Use as few words as possible from the transcript.
It is important that the questions are relevant to the content of the transcript and are at least 10 in number.
The transcript:
{transcript}
Provide the output in parsable JSON without using code blocks:
{{"questions": ["question1", "question2", ..., "question10"]}}
""".strip()
    prompt = template.format(transcript=transcript)
    try:
        reply = ollama.chat(
            model='phi3.5',
            messages=[{"role": "user", "content": prompt}],
        )
        return json.loads(reply['message']['content'])
    except Exception as e:
        # Best-effort: any model or parse failure degrades to None.
        print(f"Error generating questions: {str(e)}")
        return None
def generate_ground_truth(db_handler, data_processor, video_id):
    """Build ground-truth question data for a single video.

    Fetches the video's transcript, feeds it to ``data_processor`` for
    indexing, asks the LLM for questions, and writes the result to
    ``data/ground-truth-retrieval.csv``.

    Args:
        db_handler: Unused here; kept for signature parity with
            generate_ground_truth_for_all_videos.
        data_processor: Object whose ``process_transcript(video_id, data)``
            ingests the transcript.
        video_id: YouTube video identifier.

    Returns:
        A DataFrame with columns ``['video_id', 'question']``, or None if
        the transcript or the questions could not be obtained.
    """
    transcript_data = get_transcript(video_id)
    if not (transcript_data and 'transcript' in transcript_data):
        print(f"Failed to retrieve transcript for video {video_id}")
        return None

    full_transcript = " ".join(entry['text'] for entry in transcript_data['transcript'])
    # Index the transcript so it is searchable before generating questions.
    data_processor.process_transcript(video_id, transcript_data)

    questions = generate_questions(full_transcript)
    if not (questions and 'questions' in questions):
        print("Failed to generate questions.")
        return None

    df = pd.DataFrame(
        [(video_id, q) for q in questions['questions']],
        columns=['video_id', 'question'],
    )
    csv_path = 'data/ground-truth-retrieval.csv'
    # Fix: to_csv raises OSError if the data/ directory does not exist yet.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    df.to_csv(csv_path, index=False)
    print(f"Ground truth data saved to {csv_path}")
    return df
def generate_ground_truth_for_all_videos(db_handler, data_processor):
    """Build ground-truth question data for every video in the database.

    Iterates all videos from ``db_handler``, processes each transcript via
    ``data_processor``, generates questions with the LLM, and writes the
    combined result to ``data/ground-truth-retrieval.csv``.

    Args:
        db_handler: Object whose ``get_all_videos()`` returns an iterable of
            row tuples.
        data_processor: Object whose ``process_transcript(video_id, data)``
            ingests a transcript.

    Returns:
        A DataFrame with columns ``['video_id', 'question']`` covering all
        successfully processed videos, or None if no questions were produced.
    """
    videos = db_handler.get_all_videos()
    all_questions = []
    for video in tqdm(videos, desc="Generating ground truth"):
        video_id = video[0]  # assumes the video ID is the first tuple element — TODO confirm row schema
        transcript_data = get_transcript(video_id)
        if not (transcript_data and 'transcript' in transcript_data):
            print(f"Failed to retrieve transcript for video {video_id}")
            continue

        full_transcript = " ".join(entry['text'] for entry in transcript_data['transcript'])
        # Index the transcript so it is searchable before generating questions.
        data_processor.process_transcript(video_id, transcript_data)

        questions = generate_questions(full_transcript)
        if questions and 'questions' in questions:
            all_questions.extend((video_id, q) for q in questions['questions'])
        else:
            # Fix: this failure mode was previously silent, making it look
            # like the video simply had no questions.
            print(f"Failed to generate questions for video {video_id}")

    if not all_questions:
        print("Failed to generate questions for any video.")
        return None

    df = pd.DataFrame(all_questions, columns=['video_id', 'question'])
    csv_path = 'data/ground-truth-retrieval.csv'
    # Fix: to_csv raises OSError if the data/ directory does not exist yet.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    df.to_csv(csv_path, index=False)
    print(f"Ground truth data for all videos saved to {csv_path}")
    return df