Remove all unstructured notations
Browse files
configs/test_full_workflow.yaml
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
# Full workflow
|
2 |
-
# Requires OpenAI API key and only works with table_extraction:Unstructured and r
|
3 |
-
|
4 |
-
pagefilter:
|
5 |
-
type: FromFilename
|
6 |
-
|
7 |
-
table_extraction:
|
8 |
-
- type: Unstructured
|
9 |
-
params:
|
10 |
-
pdf_image_dpi: 300
|
11 |
-
hi_res_model_name: "yolox"
|
12 |
-
|
13 |
-
table_cleaning:
|
14 |
-
- type: LLM
|
15 |
-
params:
|
16 |
-
openai_model: "gpt-4-turbo-preview"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
configs/v0.yaml
CHANGED
@@ -10,11 +10,6 @@ table_extraction:
|
|
10 |
- type: Camelot
|
11 |
params:
|
12 |
flavor: lattice
|
13 |
-
- type: Unstructured
|
14 |
-
params:
|
15 |
-
hi_res_model_name: "yolox"
|
16 |
-
pdf_image_dpi: 300
|
17 |
# - type: LLamaParse
|
18 |
-
# - type: UnstructuredAPI
|
19 |
|
20 |
# table_cleaning:
|
|
|
10 |
- type: Camelot
|
11 |
params:
|
12 |
flavor: lattice
|
|
|
|
|
|
|
|
|
13 |
# - type: LLamaParse
|
|
|
14 |
|
15 |
# table_cleaning:
|
country_by_country/table_extraction/unstructured.py
DELETED
@@ -1,69 +0,0 @@
|
|
1 |
-
# MIT License
|
2 |
-
#
|
3 |
-
# Copyright (c) 2024 dataforgood
|
4 |
-
#
|
5 |
-
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
-
# of this software and associated documentation files (the "Software"), to deal
|
7 |
-
# in the Software without restriction, including without limitation the rights
|
8 |
-
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
-
# copies of the Software, and to permit persons to whom the Software is
|
10 |
-
# furnished to do so, subject to the following conditions:
|
11 |
-
#
|
12 |
-
# The above copyright notice and this permission notice shall be included in all
|
13 |
-
# copies or substantial portions of the Software.
|
14 |
-
#
|
15 |
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
-
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
-
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
-
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
-
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
-
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
-
# SOFTWARE.
|
22 |
-
|
23 |
-
# Standard imports
|
24 |
-
import logging
|
25 |
-
import uuid
|
26 |
-
|
27 |
-
# External imports
|
28 |
-
from io import StringIO
|
29 |
-
|
30 |
-
import pandas as pd
|
31 |
-
from unstructured.partition.pdf import partition_pdf
|
32 |
-
|
33 |
-
|
34 |
-
class Unstructured:
|
35 |
-
def __init__(self, **kwargs: dict) -> dict:
|
36 |
-
"""
|
37 |
-
Builds a pdf page parser, looking for tables using
|
38 |
-
the unstructured library.
|
39 |
-
The kwargs given to the constructor are directly propagated
|
40 |
-
to the partition_pdf function.
|
41 |
-
You are free to define any parameter partition_pdf recognizes
|
42 |
-
"""
|
43 |
-
self.kwargs = kwargs
|
44 |
-
self.type = "unstructured"
|
45 |
-
|
46 |
-
def __call__(self, pdf_filepath: str) -> dict:
|
47 |
-
logging.info("\nKicking off extraction stage...")
|
48 |
-
logging.info(f"Extraction type: {self.type}, with params: {self.kwargs}")
|
49 |
-
|
50 |
-
elements = partition_pdf(
|
51 |
-
pdf_filepath,
|
52 |
-
infer_table_structure=True,
|
53 |
-
strategy="hi_res",
|
54 |
-
**self.kwargs,
|
55 |
-
)
|
56 |
-
tables_list = [el for el in elements if el.category == "Table"]
|
57 |
-
tables_list = [
|
58 |
-
pd.read_html(StringIO(t.metadata.text_as_html))[0] for t in tables_list
|
59 |
-
]
|
60 |
-
|
61 |
-
# Create asset
|
62 |
-
new_asset = {
|
63 |
-
"id": uuid.uuid4(),
|
64 |
-
"type": "unstructured",
|
65 |
-
"params": self.kwargs,
|
66 |
-
"tables": tables_list,
|
67 |
-
}
|
68 |
-
|
69 |
-
return new_asset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
country_by_country/table_extraction/unstructured_api.py
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
# MIT License
|
2 |
-
#
|
3 |
-
# Copyright (c) 2024 dataforgood
|
4 |
-
#
|
5 |
-
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
-
# of this software and associated documentation files (the "Software"), to deal
|
7 |
-
# in the Software without restriction, including without limitation the rights
|
8 |
-
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
-
# copies of the Software, and to permit persons to whom the Software is
|
10 |
-
# furnished to do so, subject to the following conditions:
|
11 |
-
#
|
12 |
-
# The above copyright notice and this permission notice shall be included in all
|
13 |
-
# copies or substantial portions of the Software.
|
14 |
-
#
|
15 |
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
-
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
-
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
-
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
-
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
-
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
-
# SOFTWARE.
|
22 |
-
|
23 |
-
# Standard imports
|
24 |
-
import logging
|
25 |
-
|
26 |
-
# External imports
|
27 |
-
import os
|
28 |
-
import uuid
|
29 |
-
from io import StringIO
|
30 |
-
from pathlib import Path
|
31 |
-
|
32 |
-
import pandas as pd
|
33 |
-
from unstructured_client import UnstructuredClient
|
34 |
-
from unstructured_client.models import shared
|
35 |
-
|
36 |
-
|
37 |
-
class UnstructuredAPI:
|
38 |
-
def __init__(self, **kwargs: dict) -> dict:
|
39 |
-
"""
|
40 |
-
Builds a pdf page parser, looking for tables using
|
41 |
-
the unstructured.io api.
|
42 |
-
The kwargs given to the constructor are directly propagated
|
43 |
-
to the partition_pdf function.
|
44 |
-
You are free to define any parameter partition_pdf recognizes
|
45 |
-
"""
|
46 |
-
self.kwargs = kwargs
|
47 |
-
self.type = "unstructured_api"
|
48 |
-
|
49 |
-
def __call__(self, pdf_filepath: str) -> dict:
|
50 |
-
logging.info("\nKicking off extraction stage...")
|
51 |
-
logging.info(f"Extraction type: {self.type}, with params: {self.kwargs}")
|
52 |
-
|
53 |
-
s = UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"))
|
54 |
-
|
55 |
-
with Path(pdf_filepath).open("rb") as f:
|
56 |
-
# Note that this currently only supports a single file
|
57 |
-
files = shared.Files(
|
58 |
-
content=f.read(),
|
59 |
-
file_name=pdf_filepath,
|
60 |
-
)
|
61 |
-
|
62 |
-
req = shared.PartitionParameters(
|
63 |
-
files=files,
|
64 |
-
strategy="hi_res",
|
65 |
-
pdf_infer_table_structure="True",
|
66 |
-
**self.kwargs,
|
67 |
-
)
|
68 |
-
|
69 |
-
try:
|
70 |
-
resp = s.general.partition(req)
|
71 |
-
except Exception as e:
|
72 |
-
print(e)
|
73 |
-
else:
|
74 |
-
tables_list = []
|
75 |
-
for el in resp.elements:
|
76 |
-
if el["type"] == "Table":
|
77 |
-
# Enclose in try block to ignore case when pandas can't read the table
|
78 |
-
# Happens when the html is incorrectly formatted
|
79 |
-
try:
|
80 |
-
table = pd.read_html(StringIO(el["metadata"]["text_as_html"]))[
|
81 |
-
0
|
82 |
-
]
|
83 |
-
except Exception:
|
84 |
-
logging.info(
|
85 |
-
"Html table discarded. Pandas couldn't read the table.",
|
86 |
-
)
|
87 |
-
else:
|
88 |
-
tables_list.append(table)
|
89 |
-
|
90 |
-
# Create asset
|
91 |
-
new_asset = {
|
92 |
-
"id": uuid.uuid4(),
|
93 |
-
"type": "unstructured_api",
|
94 |
-
"params": self.kwargs,
|
95 |
-
"tables": tables_list,
|
96 |
-
}
|
97 |
-
|
98 |
-
return new_asset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|