test / database /query_processor.py
christopher
changed nlp and query processors to fix issues with lists
c8d57fb
raw
history blame
3.48 kB
import datetime
from typing import List, Dict, Any, Optional
import numpy as np
from models.LexRank import degree_centrality_scores
import logging
logger = logging.getLogger(__name__)
class QueryProcessor:
    """Turn a free-text query into a summarized answer over stored articles.

    Pipeline: embed the query, extract named entities, run a semantic
    search against the article store, then build an extractive summary
    (LexRank degree centrality over sentence embeddings) that is condensed
    by the abstractive summarization model.
    """

    def __init__(self, embedding_model, summarization_model, nlp_model, db_service):
        self.embedding_model = embedding_model          # text -> vector encoder (.encode)
        self.summarization_model = summarization_model  # .summarize(text) -> str
        self.nlp_model = nlp_model                      # .extract_entities / .tokenize_sentences
        self.db_service = db_service                    # async store exposing .semantic_search(...)

    async def process(
        self,
        query: str,
        topic: Optional[str] = None,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None
    ) -> Dict[str, Any]:
        """Run the full query pipeline.

        Args:
            query: Free-text user query.
            topic: Optional topic filter, passed through to the search.
            start_date: Optional lower date bound, "YYYY-MM-DD".
            end_date: Optional upper date bound, "YYYY-MM-DD".

        Returns:
            ``{"summary", "articles", "entities"}`` on success, or
            ``{"error": <message>}`` when nothing matches or any step fails.
        """
        try:
            # FIX: the module is imported as `import datetime`, so the class is
            # `datetime.datetime`. The original bare `datetime.strptime` raised
            # AttributeError for every dated query, which the broad except
            # below silently converted into a generic "Processing error".
            start_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d") if start_date else None
            end_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d") if end_date else None

            # Embed the query for vector search.
            query_embedding = self.embedding_model.encode(query).tolist()
            logger.debug(f"Generated query embedding for: {query[:50]}...")

            # Named entities narrow the search. ent[0] is assumed to be the
            # entity's surface text (see the list passed to the search below).
            entities = self.nlp_model.extract_entities(query)
            logger.debug(f"Extracted entities: {entities}")

            articles = await self.db_service.semantic_search(
                query_embedding=query_embedding,
                start_date=start_dt,
                end_date=end_dt,
                topic=topic,
                entities=[ent[0] for ent in entities]  # entity texts only
            )
            if not articles:
                logger.info("No articles found matching search criteria")
                return {"error": "No articles found matching the criteria"}

            # Flatten all article bodies into a single sentence pool.
            contents = [article["content"] for article in articles]
            sentences = []
            for content in contents:
                sentences.extend(self.nlp_model.tokenize_sentences(content))
            logger.debug(f"Processing {len(sentences)} sentences for summarization")

            if sentences:
                # LexRank: rank sentences by degree centrality of the
                # embedding similarity matrix and keep the top 10.
                embeddings = self.embedding_model.encode(sentences)
                similarity_matrix = np.inner(embeddings, embeddings)
                centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
                top_indices = np.argsort(-centrality_scores)[0:10]
                key_sentences = [sentences[idx].strip() for idx in top_indices]
                combined_text = ' '.join(key_sentences)
                summary = self.summarization_model.summarize(combined_text)
                logger.debug(f"Generated summary with {len(key_sentences)} key sentences")
            else:
                key_sentences = []
                summary = "No content available for summarization"
                logger.warning("No sentences available for summarization")

            return {
                "summary": summary,
                "articles": articles,
                "entities": entities  # expose extracted entities to the caller
            }
        except Exception as e:
            # Broad catch is deliberate: this is the service boundary, so any
            # failure is logged with a traceback and returned as an error
            # payload instead of propagating to the caller.
            logger.error(f"Error in QueryProcessor: {str(e)}", exc_info=True)
            return {"error": f"Processing error: {str(e)}"}