File size: 6,886 Bytes
57cf043
 
 
 
 
 
86c402d
57cf043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86c402d
57cf043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08bb8bf
57cf043
 
 
 
 
 
 
 
 
 
86c402d
57cf043
 
86c402d
57cf043
 
 
 
 
 
 
 
 
 
 
 
 
86c402d
57cf043
86c402d
 
57cf043
 
 
 
 
 
 
 
86c402d
57cf043
 
 
86c402d
57cf043
 
 
 
 
86c402d
57cf043
 
 
86c402d
57cf043
 
 
 
 
 
 
 
 
 
 
 
86c402d
57cf043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import logging
import os
import shutil
from pathlib import Path

from fastapi import HTTPException, UploadFile
from ntr_fileparser import UniversalParser

from sqlalchemy.orm import Session
from common.common import get_source_format
from common.configuration import Configuration
from common.constants import PROCESSING_FORMATS
from components.dbo.models.dataset import Dataset
from components.dbo.models.dataset_document import DatasetDocument
from components.dbo.models.document import Document
from schemas.document import Document as DocumentSchema
from schemas.document import DocumentDownload
from components.services.dataset import DatasetService
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class DocumentService:
    """
    Service for working with documents that are attached to datasets.

    Document metadata lives in the database (``Document`` rows linked to
    datasets through ``DatasetDocument`` rows); the uploaded file itself is
    stored under ``documents_path`` as ``<document id>.<source format>``.
    """

    def __init__(
        self,
        dataset_service: DatasetService,
        config: Configuration,
        db: Session
    ):
        """
        Store collaborators and resolve the document storage directory.

        Args:
            dataset_service: Used to resolve the current dataset and to call
                ``raise_if_processing()`` before every operation.
            config: Only ``config.db_config.files.documents_path`` is read.
            db: NOTE(review): annotated as ``Session`` but every method calls
                ``self.db()`` and uses the result as a context manager, so a
                session factory is expected here - confirm against the caller.
        """
        logger.info("Initializing DocumentService")
        self.db = db
        self.dataset_service = dataset_service
        self.parser = UniversalParser()
        self.documents_path = Path(config.db_config.files.documents_path)

    def get_document(
        self,
        document_id: int,
        dataset_id: int | None = None,
    ) -> DocumentDownload:
        """
        Build download information for a document by its identifier.

        Args:
            document_id: Identifier of the document.
            dataset_id: Dataset the document must belong to. When ``None`` the
                id of ``dataset_service.get_current_dataset()`` is used.

        Returns:
            A ``DocumentDownload`` with a display filename (title truncated to
            40 characters plus the source format) and the path of the stored
            file.

        Raises:
            HTTPException: 404 when the document is not linked to the dataset
                or the document row itself does not exist.
        """
        logger.info(f"Getting document info for ID: {document_id}")
        if dataset_id is None:
            dataset_id = self.dataset_service.get_current_dataset().dataset_id

        self.dataset_service.raise_if_processing()

        with self.db() as session:
            document_in_dataset = (
                session.query(DatasetDocument)
                .filter(
                    DatasetDocument.dataset_id == dataset_id,
                    DatasetDocument.document_id == document_id,
                )
                .first()
            )

            if not document_in_dataset:
                logger.warning(f"Document not found: {document_id}")
                raise HTTPException(status_code=404, detail="Document not found")

            document = (
                session.query(Document)
                .filter(
                    Document.id == document_id,
                )
                .first()
            )

            # A link row without a matching document row would otherwise
            # surface as an AttributeError on None.
            if document is None:
                logger.warning(f"Document not found: {document_id}")
                raise HTTPException(status_code=404, detail="Document not found")

            # Build the result while the session is still open. The stored
            # file is named after ``Document.id`` (see add_document), so the
            # same attribute is used here.
            result = DocumentDownload(
                filename=f'{document.title[:40]}.{document.source_format}',
                filepath=self.documents_path
                / f'{document.id}.{document.source_format}',
            )

        logger.debug(f"Retrieved document: {result.filename}")
        return result

    def add_document(self, dataset_id: int, file: UploadFile) -> DocumentSchema:
        """
        Add an uploaded document to a draft dataset.

        The upload is first written to ``$APP_TMP_PATH/tmp`` (``./tmp`` when
        the variable is unset), parsed, registered in the database and then
        moved into ``documents_path``. The temporary copy is removed on every
        exit path.

        Args:
            dataset_id: Identifier of the target dataset.
            file: The uploaded file.

        Returns:
            A ``DocumentSchema`` describing the stored document.

        Raises:
            HTTPException: 400 when the filename is unusable, the file cannot
                be parsed, or its format is not in ``PROCESSING_FORMATS``;
                404 when the dataset does not exist; 403 when the dataset is
                not a draft.
        """

        self.dataset_service.raise_if_processing()

        # The filename is supplied by the client: keep only the last path
        # component so it cannot point outside the temporary directory.
        safe_name = Path(file.filename or '').name
        if safe_name in ('', '.', '..'):
            raise HTTPException(status_code=400, detail='Invalid file name')

        tmp_dir = Path(os.environ.get("APP_TMP_PATH", '.')) / 'tmp'
        file_location = tmp_dir / safe_name
        tmp_dir.mkdir(parents=True, exist_ok=True)

        try:
            # Stream the upload to disk instead of loading it into memory.
            with open(file_location, 'wb') as buffer:
                shutil.copyfileobj(file.file, buffer)

            source_format = get_source_format(file.filename)

            logger.info(f"Parsing file: {file_location}")
            logger.info(f"Source format: {source_format}")

            try:
                parsed = self.parser.parse_by_path(str(file_location))
            except Exception as exc:
                # Any parser failure is reported to the client as a bad file;
                # the original error is kept in the log and exception chain.
                logger.exception(f"Failed to parse file: {file_location}")
                raise HTTPException(
                    status_code=400, detail="Invalid file, service can't parse it"
                ) from exc

            with self.db() as session:
                dataset = (
                    session.query(Dataset).filter(Dataset.id == dataset_id).first()
                )
                if not dataset:
                    raise HTTPException(status_code=404, detail='Dataset not found')

                if not dataset.is_draft:
                    raise HTTPException(status_code=403, detail='Dataset is not draft')

                document = Document(
                    filename=file.filename,
                    title=parsed.name,
                    owner=parsed.meta.owner,
                    status=parsed.meta.status,
                    source_format=source_format,
                )

                logger.info(f"Document: {document}")

                session.add(document)
                # Flush so the database assigns ``document.id`` before it is
                # used for the link row and the stored filename.
                session.flush()

                logger.info(f"Document ID: {document.id}")

                link = DatasetDocument(
                    dataset_id=dataset_id,
                    document_id=document.id,
                )
                session.add(link)

                if source_format not in PROCESSING_FORMATS:
                    logger.error(f"Unknown source format: {source_format}")
                    raise HTTPException(
                        status_code=400, detail='Unknown document format'
                    )

                destination = self.documents_path / f'{document.id}.{source_format}'
                logger.info(f"Moving file to: {destination}")
                shutil.move(file_location, destination)

                try:
                    session.commit()
                except Exception:
                    # Do not leave a stored file without a database row.
                    destination.unlink(missing_ok=True)
                    raise
                session.refresh(document)

                result = DocumentSchema(
                    id=document.id,
                    name=document.title,
                    owner=document.owner,
                    status=document.status,
                )
        finally:
            # Remove the temporary upload if it is still there (any failure
            # before the move), then drop the temp directory when it is empty.
            file_location.unlink(missing_ok=True)
            try:
                tmp_dir.rmdir()
            except OSError:
                # Best effort: the directory is not empty or already gone.
                pass

        logger.debug(f"Retrieved document: {result.name}")
        return result

    def delete_document(self, dataset_id: int, document_id: int) -> None:
        """
        Remove a document from a draft dataset.

        The link between the dataset and the document is always deleted. The
        document row and its stored file are deleted as well when no other
        dataset references the document.

        Args:
            dataset_id: Identifier of the dataset.
            document_id: Identifier of the document.

        Raises:
            HTTPException: 404 when the document is not linked to the dataset
                or the dataset does not exist; 403 when the dataset is not a
                draft.
        """

        self.dataset_service.raise_if_processing()

        stored_file = None

        with self.db() as session:
            dataset_document = (
                session.query(DatasetDocument)
                .filter(
                    DatasetDocument.dataset_id == dataset_id,
                    DatasetDocument.document_id == document_id,
                )
                .first()
            )

            if not dataset_document:
                raise HTTPException(status_code=404, detail='Document not found')

            dataset = (
                session.query(Dataset).filter(Dataset.id == dataset_id).first()
            )

            if not dataset:
                raise HTTPException(status_code=404, detail='Dataset not found')

            if not dataset.is_draft:
                raise HTTPException(status_code=403, detail='Dataset is not draft')

            document = (
                session.query(Document).filter(Document.id == document_id).first()
            )

            # Count links from OTHER datasets only: the link being removed
            # would otherwise always keep the count above zero.
            other_links = (
                session.query(DatasetDocument)
                .filter(
                    DatasetDocument.document_id == document_id,
                    DatasetDocument.dataset_id != dataset_id,
                )
                .count()
            )

            session.delete(dataset_document)

            if other_links == 0 and document is not None:
                stored_file = (
                    self.documents_path / f'{document_id}.{document.source_format}'
                )
                # Flush the link deletion first so the document row is no
                # longer referenced when it is deleted.
                session.flush()
                session.delete(document)

            session.commit()

        # Remove the file only after the database change is committed; a file
        # that is already missing is not an error.
        if stored_file is not None:
            stored_file.unlink(missing_ok=True)