from datasets import load_dataset

# Load the 'train' split once at import time; search_relevant_data() reads
# this module-level object on every call.
dataset = load_dataset(
    'tom-010/google_natural_questions_answerability',
    split='train',
)

def _mentions(value, needle):
    """Return True when *value* is a string that contains *needle*, ignoring case.

    *needle* must already be lower-cased by the caller. ``None`` and any other
    non-string value never match.
    """
    return isinstance(value, str) and needle in value.lower()


def _first_words(value, max_words):
    """Return the first *max_words* whitespace-separated words of *value*.

    Words are re-joined with single spaces, so runs of whitespace collapse.
    ``None``, empty and non-string values yield an empty string.
    """
    if not value or not isinstance(value, str):
        return ""
    return ' '.join(value.split()[:max_words])


# Function to filter based on a query/topic and return relevant data
def search_relevant_data(topic="Artificial Intelligence", max_words=250, top_n=250):
    """Find rows of the module-level ``dataset`` that mention *topic*.

    A row matches when *topic* occurs, case-insensitively, as a substring of
    its 'question', 'answer' or 'text' field. Fields that are ``None`` (or not
    strings) are treated as non-matching.

    Args:
        topic: Substring to search for.
        max_words: Maximum number of words kept from 'answer' and 'text'.
        top_n: Maximum number of matching rows to return.

    Returns:
        A list of dicts with the keys 'question', 'answer' and 'text', built
        from the first *top_n* matching rows. 'answer' and 'text' are
        truncated to *max_words* words; a missing or ``None`` field becomes
        an empty string.
    """
    # Lower-case the topic once instead of up to three times per row.
    needle = topic.lower()

    matches = dataset.filter(
        lambda row: any(
            _mentions(row[field], needle)
            for field in ('question', 'answer', 'text')
        )
    )

    # Never ask select() for more rows than the filter produced.
    matches = matches.select(range(min(top_n, len(matches))))

    relevant_documents = []
    for entry in matches:
        # Rows are expected to be dicts; report anything else and move on.
        if not isinstance(entry, dict):
            print("Unexpected entry format:", entry)
            continue

        # .get(key, '') would still return None for a present-but-None field,
        # so normalise explicitly.
        question = entry.get('question')
        relevant_documents.append({
            "question": "" if question is None else question,
            "answer": _first_words(entry.get('answer'), max_words),
            "text": _first_words(entry.get('text'), max_words),
        })

    return relevant_documents  # Return the list of relevant documents

# Sample search query
#relevant_data = search_relevant_data("vatican city")  # Change to the desired query/topic
#print(f"Found {len(relevant_data)} relevant documents.")