Spaces:
Sleeping
Sleeping
import logging
from collections.abc import Callable
from contextlib import AbstractContextManager

import pandas as pd
from sqlalchemy.orm import Session

from components.dbo.models.acronym import Acronym
from components.dbo.models.dataset import Dataset
from components.dbo.models.dataset_document import DatasetDocument
from schemas.acronym import AcronymCollectionResponse
logger = logging.getLogger(__name__)


class AcronymService:
    """Service for storing and querying acronyms and abbreviations.

    An acronym either belongs to one document (``document_id`` is set) or is
    global (``document_id`` is NULL). Every lookup in this class returns the
    global acronyms alongside the document- or dataset-specific ones.
    """

    def __init__(self, db: Callable[[], AbstractContextManager[Session]]):
        """Store the session factory used by every method of the service.

        Args:
            db: Zero-argument callable. Each call must return a context
                manager that yields a SQLAlchemy ``Session``; the methods
                below use it as ``with self.db() as session``.
        """
        logger.info("Initializing AcronymService")
        self.db = db

    def from_pandas(self, df: pd.DataFrame) -> None:
        """Load acronyms from a DataFrame and commit them in one transaction.

        Args:
            df: DataFrame with the columns ``document_id``, ``short_form``,
                ``full_form`` and ``type``. A missing ``document_id``
                (NaN/None) is stored as NULL.

        Raises:
            Exception: Any error raised while building or saving the rows is
                logged with its traceback and re-raised after a rollback.
        """
        logger.info("Loading acronyms from DataFrame with %d rows", len(df))
        with self.db() as session:
            try:
                session.add_all(
                    [
                        Acronym(
                            short_form=row['short_form'],
                            full_form=row['full_form'],
                            type=row['type'],
                            document_id=(
                                int(row['document_id'])
                                if pd.notna(row['document_id'])
                                else None
                            ),
                        )
                        for _, row in df.iterrows()
                    ]
                )
                session.commit()
                logger.info("Successfully loaded all acronyms")
            except Exception as e:
                session.rollback()
                logger.exception("Error processing acronyms: %s", e)
                # Bare ``raise`` keeps the original traceback intact.
                raise
            # No explicit close(): the ``with`` block owns the session
            # lifetime, exactly as in the other methods of this class.

    def get_abbreviations(self, document_id: int) -> list[Acronym]:
        """Return the acronyms of one document plus all global acronyms.

        Args:
            document_id: ID of the document to look up.

        Returns:
            Acronyms whose ``document_id`` equals ``document_id`` or is NULL.
        """
        logger.info("Getting abbreviations for document %s", document_id)
        with self.db() as session:
            result = (
                session.query(Acronym)
                .filter(
                    (Acronym.document_id == document_id)
                    | Acronym.document_id.is_(None)
                )
                .all()
            )
            logger.debug(
                "Found %d abbreviations for document %s",
                len(result),
                document_id,
            )
            return result

    def get_abbreviations_by_dataset_id(self, dataset_id: int) -> list[Acronym]:
        """Return the acronyms of a dataset's documents plus global acronyms.

        Args:
            dataset_id: ID of the dataset to look up.

        Returns:
            The list produced by ``_get_acronyms_for_dataset``.
        """
        logger.info("Getting abbreviations for dataset %s", dataset_id)
        return self._get_acronyms_for_dataset(dataset_id)

    def get_current_acronyms(self) -> AcronymCollectionResponse:
        """Return the acronyms of the currently active dataset.

        Returns:
            A response describing the first dataset with ``is_active`` set
            and its acronyms grouped by short form. If no dataset is active,
            an empty response (``collection_id=0``, no acronyms) is returned.
        """
        logger.info("Getting acronyms for current active dataset")
        with self.db() as session:
            active_dataset = (
                session.query(Dataset)
                # ``==`` (not ``is``) is required: SQLAlchemy overloads it to
                # build the SQL comparison.
                .filter(Dataset.is_active == True)  # noqa: E712
                .first()
            )
            if active_dataset is None:
                logger.warning("No active dataset found")
                return AcronymCollectionResponse(
                    collection_id=0,
                    collection_name="",
                    collection_filename="",
                    updated_at=None,
                    acronyms={},
                )
            # Copy the scalar fields while the session is open so the acronym
            # lookup below does not run with a second session nested inside
            # this one.
            dataset_id = active_dataset.id
            dataset_name = active_dataset.name
            date_created = active_dataset.date_created

        acronyms = self._get_acronyms_for_dataset(dataset_id)
        return AcronymCollectionResponse(
            collection_id=dataset_id,
            collection_name=dataset_name,
            collection_filename='',
            # TODO (carried over from the original): it is unclear whether
            # date_created is the right source for updated_at.
            updated_at=date_created,
            acronyms=self._compress_acronyms(acronyms),
        )

    def _get_acronyms_for_dataset(self, dataset_id: int) -> list[Acronym]:
        """Return acronyms linked to a dataset's documents plus global ones.

        Args:
            dataset_id: ID of the dataset.

        Returns:
            Acronyms whose ``document_id`` is among the selected document IDs
            or is NULL.
        """
        with self.db() as session:
            # NOTE(review): this filters on DatasetDocument.id, not on a
            # dataset foreign key. If DatasetDocument is a dataset<->document
            # link table this probably should compare the dataset column --
            # confirm against the model before changing it.
            rows = (
                session.query(DatasetDocument.document_id)
                .filter(DatasetDocument.id == dataset_id)
                .all()
            )
            # Each row is a one-element tuple; unpack it to the bare ID.
            document_ids = [doc_id for (doc_id,) in rows]
            result = (
                session.query(Acronym)
                .filter(
                    Acronym.document_id.in_(document_ids)
                    | Acronym.document_id.is_(None)
                )
                .all()
            )
            logger.debug(
                "Found %d acronyms for dataset %s", len(result), dataset_id
            )
            return result

    def _compress_acronyms(self, acronyms: list[Acronym]) -> dict[str, list[str]]:
        """Group full forms by short form.

        Args:
            acronyms: Objects exposing ``short_form`` and ``full_form``.

        Returns:
            Mapping of each short form to the list of its full forms, in the
            order they appear in ``acronyms``. Duplicates are kept.
        """
        # Single pass instead of rescanning the whole list per short form.
        compressed: dict[str, list[str]] = {}
        for acronym in acronyms:
            compressed.setdefault(acronym.short_form, []).append(
                acronym.full_form
            )
        logger.debug(
            "Compressed %d acronyms into %d unique short forms",
            len(acronyms),
            len(compressed),
        )
        return compressed