import logging
import pickle
from dataclasses import asdict, dataclass, field, fields
from pathlib import Path
from typing import Callable

import numpy as np
import pandas as pd

from common.constants import UNKNOWN
from components.embedding_extraction import EmbeddingExtractor

logger = logging.getLogger(__name__)


@dataclass
class DatasetRow:
    """A single row of the documents dataset.

    Required fields identify the text fragment and its source document;
    the optional structural fields default to UNKNOWN when the fragment
    does not belong to the corresponding document section.

    NOTE(review): ``Pargaraph`` / ``PargaraphAppendix`` look like typos for
    "Paragraph", but these names are part of the public schema (dataclass
    fields -> DataFrame columns -> pickled files), so they are kept as-is
    for backward compatibility.
    """

    Index: int
    Text: str
    DocName: str
    Title: str
    DocNumber: str
    LevelParagraph: str = UNKNOWN
    Pargaraph: str = UNKNOWN
    Duplicate: str = UNKNOWN
    PartLevel1: str = UNKNOWN
    PartLevel2: str = UNKNOWN
    Appendix: str = UNKNOWN
    LevelParagraphAppendix: str = UNKNOWN
    PargaraphAppendix: str = UNKNOWN
    DuplicateAppendix: str = UNKNOWN
    PartLevel1Appendix: str = UNKNOWN
    Table: str = UNKNOWN


class DocumentsDataset:
    """In-memory dataset of document rows with optional text embeddings.

    A freshly constructed dataset has no vectors; call
    :meth:`vectorize_with` to compute them.
    """

    def __init__(self, rows: list[DatasetRow]):
        self.rows = rows
        # One embedding per row once vectorize_with() has run; None before.
        self.vectors: np.ndarray | None = None

    def vectorize_with(
        self,
        vectorizer: EmbeddingExtractor,
        progress_callback: Callable[[int, int], None] | None = None,
    ) -> None:
        """Compute embeddings for every row's text.

        Args:
            vectorizer: Extractor whose ``vectorize`` method maps a list of
                texts to an embedding array.
            progress_callback: Optional ``(done, total)`` hook forwarded to
                the vectorizer.
        """
        logger.info('Starting dataset vectorization')
        total = len(self.rows)
        texts = [row.Text for row in self.rows]
        self.vectors = vectorizer.vectorize(texts, progress_callback)
        logger.info('Completed vectorization of %s rows', total)

    def to_pandas(self) -> pd.DataFrame:
        """Convert the dataset to a pandas DataFrame.

        Returns:
            pd.DataFrame: One row per :class:`DatasetRow`, plus an
            ``Embedding`` column (per-row vector as a list, or NaN when the
            dataset has not been vectorized).
        """
        df = pd.DataFrame([asdict(row) for row in self.rows])
        if self.vectors is not None:
            df['Embedding'] = self.vectors.tolist()
        else:
            df['Embedding'] = np.nan
        return df

    def to_pickle(self, path: Path) -> None:
        """Save the dataset to a pickle file as a DataFrame.

        Args:
            path: Destination file path.
        """
        logger.info('Saving dataset to %s', path)
        with open(path, 'wb') as f:
            pickle.dump(self.to_pandas(), f)
        logger.info('Dataset saved successfully')

    @classmethod
    def _from_dataframe(cls, df: pd.DataFrame) -> 'DocumentsDataset':
        """Rebuild a dataset from the DataFrame layout written by to_pickle."""
        names = [f.name for f in fields(DatasetRow)]
        rows = [
            DatasetRow(**{name: record[name] for name in names})
            for record in df.to_dict('records')
        ]
        dataset = cls(rows)
        # Restore vectors only if every row actually has an embedding.
        if 'Embedding' in df.columns and len(df) and df['Embedding'].notna().all():
            dataset.vectors = np.array(df['Embedding'].tolist())
        return dataset

    @classmethod
    def from_pickle(cls, path: Path) -> 'DocumentsDataset':
        """Load a dataset from a pickle file.

        Accepts both the DataFrame layout produced by :meth:`to_pickle` and a
        directly pickled :class:`DocumentsDataset` (legacy format).

        BUG FIX: the previous implementation returned the unpickled object
        as-is and called ``len(dataset.rows)`` on it — but ``to_pickle``
        writes a ``pd.DataFrame``, which has no ``rows`` attribute, so every
        round trip raised ``AttributeError``. The DataFrame is now converted
        back into a ``DocumentsDataset``.

        SECURITY: ``pickle.load`` executes arbitrary code from the file —
        only load files from trusted sources.

        Args:
            path: Source file path.

        Returns:
            DocumentsDataset: The reconstructed dataset.

        Raises:
            Exception: Re-raises any error from opening or unpickling.
        """
        logger.info('Loading dataset from %s', path)
        try:
            with open(path, 'rb') as f:
                payload = pickle.load(f)
            if isinstance(payload, pd.DataFrame):
                dataset = cls._from_dataframe(payload)
            else:
                dataset = payload
            logger.info('Loaded dataset with %s rows', len(dataset.rows))
            return dataset
        except Exception as e:
            logger.error('Failed to load dataset: %s', e)
            raise