muryshev's picture
init
57cf043
raw
history blame
5.78 kB
import logging
import pandas as pd
from sqlalchemy.orm import Session
from components.dbo.models.acronym import Acronym
from components.dbo.models.dataset import Dataset
from components.dbo.models.dataset_document import DatasetDocument
from schemas.acronym import AcronymCollectionResponse
# Module-level logger named after this module; used by AcronymService below.
logger = logging.getLogger(__name__)
class AcronymService:
    """Service for working with abbreviations and acronyms.

    Every method opens its own session via ``self.db()`` and relies on the
    ``with`` block to release it.
    """

    def __init__(self, db: Session):
        """Store the database handle used by all methods.

        Args:
            db: Object that is called with no arguments (``self.db()``) and
                whose result is used as a context manager yielding a session.
                NOTE(review): the ``Session`` annotation is kept from the
                original, but the usage looks like a session factory --
                confirm against the callers.
        """
        logger.info("Initializing AcronymService")
        self.db = db

    def from_pandas(self, df: pd.DataFrame) -> None:
        """Load abbreviations and acronyms from a pandas DataFrame.

        All rows are added to one session and committed together; on any
        error the session is rolled back and the exception is re-raised.

        Args:
            df: DataFrame with the columns ``document_id``, ``short_form``,
                ``full_form`` and ``type``. A missing (NA) ``document_id``
                is stored as ``None``.

        Raises:
            Exception: Whatever was raised while building or committing the
                rows, re-raised unchanged after rollback.
        """
        logger.info("Loading acronyms from DataFrame with %s rows", len(df))

        with self.db() as session:
            try:
                for _, row in df.iterrows():
                    raw_document_id = row['document_id']
                    session.add(
                        Acronym(
                            short_form=row['short_form'],
                            full_form=row['full_form'],
                            type=row['type'],
                            # NA means "not tied to a document".
                            document_id=(
                                int(raw_document_id)
                                if pd.notna(raw_document_id)
                                else None
                            ),
                        )
                    )
                session.commit()
            except Exception as e:
                session.rollback()
                # logger.exception logs at ERROR level and keeps the traceback.
                logger.exception("Error processing acronyms: %s", e)
                raise
            logger.info("Successfully loaded all acronyms")

    def get_abbreviations(self, document_id: int) -> list[Acronym]:
        """Get abbreviations and acronyms for a document.

        Args:
            document_id: ID of the document.

        Returns:
            Acronyms whose ``document_id`` equals ``document_id`` together
            with acronyms whose ``document_id`` is NULL.
        """
        logger.info("Getting abbreviations for document %s", document_id)

        with self.db() as session:
            result = (
                session.query(Acronym)
                .filter(
                    (Acronym.document_id == document_id)
                    | (Acronym.document_id.is_(None))
                )
                .all()
            )
            logger.debug(
                "Found %s abbreviations for document %s", len(result), document_id
            )
            return result

    def get_abbreviations_by_dataset_id(self, dataset_id: int) -> list[Acronym]:
        """Get abbreviations and acronyms for a dataset.

        Args:
            dataset_id: ID of the dataset.

        Returns:
            The result of ``_get_acronyms_for_dataset(dataset_id)``.
        """
        logger.info("Getting abbreviations for dataset %s", dataset_id)
        return self._get_acronyms_for_dataset(dataset_id)

    def get_current_acronyms(self) -> AcronymCollectionResponse:
        """Get abbreviations and acronyms for the currently active dataset.

        Returns:
            A response describing the first dataset with ``is_active`` set,
            with its acronyms grouped by short form. If there is no active
            dataset, a response with ``collection_id=0``, empty names,
            ``updated_at=None`` and no acronyms.
        """
        logger.info("Getting acronyms for current active dataset")

        with self.db() as session:
            active_dataset = (
                session.query(Dataset)
                # SQL expression, not a Python comparison.
                .filter(Dataset.is_active == True)  # noqa: E712
                .first()
            )

            if not active_dataset:
                logger.warning("No active dataset found")
                return AcronymCollectionResponse(
                    collection_id=0,
                    collection_name="",
                    collection_filename="",
                    updated_at=None,
                    acronyms={},
                )

            result = self._get_acronyms_for_dataset(active_dataset.id)

            return AcronymCollectionResponse(
                collection_id=active_dataset.id,
                collection_name=active_dataset.name,
                collection_filename='',
                # TODO (original author's note: "what?"): confirm that
                # date_created is the intended value for updated_at.
                updated_at=active_dataset.date_created,
                acronyms=self._compress_acronyms(result),
            )

    def _get_acronyms_for_dataset(self, dataset_id: int) -> list[Acronym]:
        """Get the list of acronyms for a dataset.

        Args:
            dataset_id: ID of the dataset.

        Returns:
            Acronyms attached to any of the document IDs selected from
            ``DatasetDocument`` together with acronyms whose ``document_id``
            is NULL.
        """
        with self.db() as session:
            # NOTE(review): this filters on DatasetDocument.id, which looks
            # like the row's own key rather than a dataset reference. Confirm
            # against the model whether a dataset foreign key was intended.
            document_rows = (
                session.query(DatasetDocument.document_id)
                .filter(DatasetDocument.id == dataset_id)
                .all()
            )
            # Each row is a one-element tuple holding the document ID.
            document_ids = [row[0] for row in document_rows]

            result = (
                session.query(Acronym)
                .filter(
                    (Acronym.document_id.in_(document_ids))
                    | (Acronym.document_id.is_(None))
                )
                .all()
            )
            logger.debug("Found %s acronyms for dataset %s", len(result), dataset_id)
            return result

    def _compress_acronyms(self, acronyms: list[Acronym]) -> dict[str, list[str]]:
        """Group acronyms into a dictionary keyed by short form.

        Args:
            acronyms: Objects exposing ``short_form`` and ``full_form``.

        Returns:
            Mapping from each distinct short form to the list of its full
            forms, in the order they appear in ``acronyms``.
        """
        compressed: dict[str, list[str]] = {}
        # Single pass; avoids rescanning the whole list per short form.
        for acronym in acronyms:
            compressed.setdefault(acronym.short_form, []).append(acronym.full_form)

        logger.debug(
            "Compressed %s acronyms into %s unique short forms",
            len(acronyms),
            len(compressed),
        )
        return compressed