Spaces:
Sleeping
Sleeping
| from datasets import load_dataset | |
# Fetch only the 'train' split of the answerability dataset; the search
# function below reads this module-level object by default.
dataset = load_dataset(
    "tom-010/google_natural_questions_answerability",
    split="train",
)
| # Function to filter based on a query/topic and return relevant data | |
def search_relevant_data(topic="Artificial Intelligence", max_words=100, top_n=100, data=None):
    """Return excerpts of the first ``top_n`` rows that mention ``topic``.

    A row matches when ``topic`` occurs, case-insensitively, as a substring
    of its ``question``, ``answer`` or ``text`` field. Rows are scanned in
    order and the scan stops as soon as ``top_n`` matches have been
    collected, so a frequent topic does not force a pass over every row.

    Args:
        topic: Substring to look for (compared case-insensitively).
        max_words: Maximum number of whitespace-separated words kept from
            the ``answer`` and ``text`` fields. Negative values are treated
            as 0.
        top_n: Maximum number of documents to return. Zero or a negative
            value yields an empty list.
        data: Optional iterable of row dicts to search. When omitted, the
            module-level ``dataset`` is used.

    Returns:
        A list of dicts with the keys ``"question"``, ``"answer"`` and
        ``"text"``. Fields that are missing or ``None`` in the row are
        returned as ``""``.
    """
    if top_n <= 0:
        return []

    rows = dataset if data is None else data
    needle = topic.lower()  # lower-case once, not once per field per row
    word_limit = max(max_words, 0)  # a negative slice bound would drop trailing words
    fields = ("question", "answer", "text")

    relevant_documents = []
    for row in rows:
        if not isinstance(row, dict):
            print("Unexpected entry format:", row)
            continue

        raw_values = [row.get(field) for field in fields]
        # Only non-None fields take part in the match.
        if not any(
            value is not None and needle in value.lower()
            for value in raw_values
        ):
            continue

        # Normalise None to "" so every returned field is a string.
        question, answer, text = (value or "" for value in raw_values)
        relevant_documents.append({
            "question": question,
            "answer": " ".join(answer.split()[:word_limit]),
            "text": " ".join(text.split()[:word_limit]),
        })

        # Stop scanning as soon as enough matches have been collected.
        if len(relevant_documents) >= top_n:
            break

    return relevant_documents
| # Sample search query | |
| #relevant_data = search_relevant_data("vatican city") # Change to the desired query/topic | |
| #print(f"Found {len(relevant_data)} relevant documents.") | |