from datetime import datetime
from typing import List, Dict, Any, Optional
import numpy as np
from models.LexRank import degree_centrality_scores
import logging

logger = logging.getLogger(__name__)

class QueryProcessor:
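    """Answer a user query against the article store.

    Embeds the query, extracts named entities, runs a semantic search,
    and condenses the matching articles into a summary.
    """
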
    def __init__(self, embedding_model, summarization_model, nlp_model, db_service):
        self.embedding_model = embedding_model
        self.summarization_model = summarization_model
        self.nlp_model = nlp_model
        self.db_service = db_service
    
    async def process(
        self,
        query: str,
        topic: Optional[str] = None,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None
    ) -> Dict[str, Any]:
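        """Run the full query pipeline.

        Dates are expected as ISO strings ("YYYY-MM-DD"). Returns a dict
        with "summary", "articles", and "entities" on success, or a dict
        with a single "error" key on failure.
        """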
        try:
            # Convert string dates to datetime objects
            start_dt = datetime.strptime(start_date, "%Y-%m-%d") if start_date else None
            end_dt = datetime.strptime(end_date, "%Y-%m-%d") if end_date else None
            
            # Get query embedding
            query_embedding = self.embedding_model.encode(query).tolist()
            logger.debug(f"Generated query embedding for: {query[:50]}...")

            # Extract entities using the NLP model
            entities = self.nlp_model.extract_entities(query)
            logger.debug(f"Extracted entities: {entities}")

            # Semantic search with entities
            articles = await self.db_service.semantic_search(
                query_embedding=query_embedding,
                start_date=start_dt,
                end_date=end_dt,
                topic=topic,
                entities=[ent[0] for ent in entities]  # Using just the entity texts
            )
            
            if not articles:
                logger.info("No articles found matching search criteria")
                return {"error": "No articles found matching the criteria"}
            
            # Process results
            contents = [article["content"] for article in articles]
            sentences = []
            for content in contents:
                sentences.extend(self.nlp_model.tokenize_sentences(content))
            
            logger.debug(f"Processing {len(sentences)} sentences for summarization")

            # Generate summary
            if sentences:
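                # Extractive step: score every sentence by degree centrality
                # over the embedding similarity matrix (LexRank-style) and
                # keep the most central ones as input for the abstractive
                # summarizer.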
                embeddings = self.embedding_model.encode(sentences)
                similarity_matrix = np.inner(embeddings, embeddings)
                centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
                
                top_indices = np.argsort(-centrality_scores)[:10]
                key_sentences = [sentences[idx].strip() for idx in top_indices]
                combined_text = ' '.join(key_sentences)
                
                summary = self.summarization_model.summarize(combined_text)
                logger.debug(f"Generated summary with {len(key_sentences)} key sentences")
            else:
                key_sentences = []
                summary = "No content available for summarization"
                logger.warning("No sentences available for summarization")
            
            return {
                "summary": summary,
                "articles": articles,
                "entities": entities  # Include extracted entities in response
            }

        except Exception as e:
            logger.error(f"Error in QueryProcessor: {str(e)}", exc_info=True)
            return {"error": f"Processing error: {str(e)}"}