Spaces:

Adnan-AI-Labs
/

QA-ContextRetriever

Sleeping

QA-ContextRetriever / data_ret.py

Muhammad Adnan

Initial commit of Streamlit app

ae6eb20 about 1 year ago

2.86 kB

	from datasets import load_dataset

	# Load the 'train' split once at import time; search_relevant_data() reads
	# this module-level object.
	dataset = load_dataset(
	    path='tom-010/google_natural_questions_answerability',
	    split='train',
	)

	# Function to filter based on a query/topic and return relevant data
	def search_relevant_data(topic="Artificial Intelligence", max_words=100, top_n=100):
	    """Return excerpts of up to ``top_n`` dataset rows that mention ``topic``.

	    A row matches when ``topic`` occurs, case-insensitively, as a substring
	    of its 'question', 'answer' or 'text' field. A field whose value is
	    ``None`` is ignored for matching.

	    Args:
	        topic: Substring to look for.
	        max_words: Maximum number of whitespace-separated words kept from
	            the 'answer' and 'text' fields. Negative values are treated as
	            zero (a negative slice bound would drop trailing words instead
	            of limiting the excerpt). ``None`` keeps every word.
	        top_n: Maximum number of matching rows to return.

	    Returns:
	        A list of dicts with the keys "question" (the value as stored in
	        the row, not truncated), "answer" and "text" (word-limited
	        excerpts, or "" when the field is empty or missing).
	    """
	    # Lower-case the topic once: the predicate below is evaluated for every
	    # row of the dataset, so this must not be repeated per field.
	    needle = topic.lower()
	    word_limit = None if max_words is None else max(max_words, 0)

	    def mentions_topic(row):
	        # True when any of the three searched fields contains the topic.
	        return any(
	            row[field] is not None and needle in row[field].lower()
	            for field in ('question', 'answer', 'text')
	        )

	    matches = dataset.filter(mentions_topic)

	    # Never ask for more rows than the filter actually produced.
	    matches = matches.select(range(min(top_n, len(matches))))

	    relevant_documents = []
	    for entry in matches:
	        # Rows are expected to be dicts; report anything else and move on.
	        if not isinstance(entry, dict):
	            print("Unexpected entry format:", entry)
	            continue

	        answer = entry.get('answer', '')
	        text = entry.get('text', '')

	        # The truthiness checks also cover fields stored as None.
	        relevant_documents.append({
	            "question": entry.get('question', ''),
	            "answer": ' '.join(answer.split()[:word_limit]) if answer else "",
	            "text": ' '.join(text.split()[:word_limit]) if text else "",
	        })

	    return relevant_documents

	# Sample search query
	#relevant_data = search_relevant_data("vatican city") # Change to the desired query/topic
	#print(f"Found {len(relevant_data)} relevant documents.")