|
import datetime |
|
from typing import List, Dict, Any, Optional |
|
import numpy as np |
|
from models.LexRank import degree_centrality_scores |
|
import logging |
|
|
|
logger = logging.getLogger(__name__)


class QueryProcessor:
    """Answer a free-text query over stored articles.

    Pipeline: embed the query, extract named entities, run a semantic
    search against the article store, rank the retrieved sentences by
    degree centrality (LexRank), and summarize the top sentences.
    """

    # Maximum number of top-centrality sentences fed to the summarizer.
    MAX_KEY_SENTENCES = 10

    def __init__(self, embedding_model, summarization_model, nlp_model, db_service):
        # embedding_model: exposes encode(text) -> array-like with .tolist()
        # summarization_model: exposes summarize(text) -> str
        # nlp_model: exposes extract_entities(text) and tokenize_sentences(text)
        # db_service: exposes async semantic_search(...) -> list of article dicts
        self.embedding_model = embedding_model
        self.summarization_model = summarization_model
        self.nlp_model = nlp_model
        self.db_service = db_service

    @staticmethod
    def _parse_date(value: Optional[str]) -> Optional[datetime.datetime]:
        """Parse a "YYYY-MM-DD" string, or return None when absent.

        A malformed string raises ValueError, which process() reports
        through its generic error path.
        """
        # BUG FIX: the file imports the *module* (`import datetime`), so the
        # original `datetime.strptime(...)` raised AttributeError and every
        # date-filtered query fell into the generic error branch.
        return datetime.datetime.strptime(value, "%Y-%m-%d") if value else None

    def _select_key_sentences(self, sentences: List[str]) -> List[str]:
        """Rank sentences by degree centrality and return the top ones, stripped."""
        embeddings = self.embedding_model.encode(sentences)
        # Pairwise inner products of the sentence embeddings serve as the
        # similarity matrix for the centrality computation.
        similarity_matrix = np.inner(embeddings, embeddings)
        centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
        # argsort of the negated scores yields indices in descending score order.
        top_indices = np.argsort(-centrality_scores)[: self.MAX_KEY_SENTENCES]
        return [sentences[idx].strip() for idx in top_indices]

    async def process(
        self,
        query: str,
        topic: Optional[str] = None,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None
    ) -> Dict[str, Any]:
        """Run the full search-and-summarize pipeline for one query.

        Args:
            query: Free-text user query.
            topic: Optional topic filter forwarded to the search backend.
            start_date: Optional lower date bound, "YYYY-MM-DD".
            end_date: Optional upper date bound, "YYYY-MM-DD".

        Returns:
            {"summary": str, "articles": list, "entities": list} on success,
            or {"error": str} when nothing matches or any step fails.
        """
        try:
            start_dt = self._parse_date(start_date)
            end_dt = self._parse_date(end_date)

            query_embedding = self.embedding_model.encode(query).tolist()
            logger.debug("Generated query embedding for: %s...", query[:50])

            entities = self.nlp_model.extract_entities(query)
            logger.debug("Extracted entities: %s", entities)

            # Entities are (text, label) pairs; the backend only needs the text.
            articles = await self.db_service.semantic_search(
                query_embedding=query_embedding,
                start_date=start_dt,
                end_date=end_dt,
                topic=topic,
                entities=[ent[0] for ent in entities]
            )

            if not articles:
                logger.info("No articles found matching search criteria")
                return {"error": "No articles found matching the criteria"}

            # Flatten every article body into a single sentence list.
            sentences: List[str] = []
            for article in articles:
                sentences.extend(self.nlp_model.tokenize_sentences(article["content"]))

            logger.debug("Processing %d sentences for summarization", len(sentences))

            if sentences:
                key_sentences = self._select_key_sentences(sentences)
                summary = self.summarization_model.summarize(' '.join(key_sentences))
                logger.debug("Generated summary with %d key sentences", len(key_sentences))
            else:
                summary = "No content available for summarization"
                logger.warning("No sentences available for summarization")

            return {
                "summary": summary,
                "articles": articles,
                "entities": entities
            }

        except Exception as e:
            # Top-level boundary: log with traceback, return a structured error.
            logger.error(f"Error in QueryProcessor: {str(e)}", exc_info=True)
            return {"error": f"Processing error: {str(e)}"}