Carlos Salgado committed · Commit e39bb0b
Parent(s): c40d04b

fallback on pypdf, trim flake, minor ux

Files changed:
- app.py +15 -15
- flake.nix +4 -18
- requirements.txt +2 -2
- scripts.py +33 -29
app.py
CHANGED

@@ -13,17 +13,17 @@ def suggest_metadata(file_upload):
 
     with tempfile.NamedTemporaryFile(delete=False) as tmp:
         tmp.write(uploaded_file.read())
-
-        st.write(f'Created temporary file {file_path}')
+        st.write(f'Created temporary file {tmp.name}')
 
-    st.write('##
-
-
+    st.write('## Ingesting Unstructured file')
+
+    docs = ingest(tmp.name)
+    print(f'Ingested {tmp.name}')
 
+    metadata = generate_metadata(docs)
     st.write('## Querying Together.ai API')
-
-    st.write(f'
-    st.write(f'### {metadata}')
+    st.write(f'### Suggested Metadata Generated by {MODEL_NAME}')
+    st.write(f'#### {metadata}')
 
 with st.form('analyze_form'):
     st.write('Enter your file metadata in the following schema:')

@@ -38,14 +38,14 @@ with st.form('analyze_form'):
         analysis = analyze_metadata(filename, description, discipline)
 
         st.write(analysis)
+        submitted = None
 
 st.write('## Generate metadata?')
-uploaded_file = st.file_uploader("Choose a PDF file", type=
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
 
-if uploaded_file is not None:
+if uploaded_file is not None:
 
-
-
-
-
-    os.remove(file_path)
+    query_api = st.button('Query API')
+    if query_api:
+        suggest_metadata(uploaded_file)
+        query_api = None
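For orientation, here is the upload flow app.py arrives at after this commit, condensed into a runnable sketch. It assumes scripts.py exposes ingest(), generate_metadata() and MODEL_NAME, which is how app.py appears to use them; the button gate keeps the Together.ai call from re-firing on every Streamlit rerun, which looks like the "minor ux" part of the commit message.

import tempfile

import streamlit as st

from scripts import MODEL_NAME, generate_metadata, ingest  # assumed exports

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Only hit the API on an explicit click, not on every widget interaction.
    if st.button('Query API'):
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            tmp.write(uploaded_file.read())
        docs = ingest(tmp.name)             # pypdf first, unstructured fallback
        metadata = generate_metadata(docs)  # Together.ai chat completion
        st.write(f'### Suggested Metadata Generated by {MODEL_NAME}')
        st.write(f'#### {metadata}')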
flake.nix
CHANGED

@@ -1,6 +1,6 @@
 {
   description = "A LLM backend development flake powered by unstructured and langchain";
-
+
   inputs = {
     nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
   };

@@ -9,6 +9,7 @@
     system = "x86_64-linux";
     # ↑ Swap it for your system if needed
     # "aarch64-linux" / "x86_64-darwin" / "aarch64-darwin"
+    debug = true;
     pkgs = nixpkgs.legacyPackages.${system};
   in {
     devShells.${system}.default = pkgs.mkShell {

@@ -17,33 +18,18 @@
         python-pkgs.pip # VsCode starts
         python-pkgs.jupyter
         python-pkgs.notebook # VsCode ends
-        python-pkgs.numpy
         python-pkgs.pandas
-        python-pkgs.scipy
-        python-pkgs.matplotlib
         python-pkgs.requests
         python-pkgs.langchain-community
         python-pkgs.langchain
         python-pkgs.langchain-text-splitters
-        python-pkgs.
-        python-pkgs.wrapt # unstructured[local-inference] starts
-        python-pkgs.iso-639
-        python-pkgs.emoji
-        python-pkgs.pillow-heif
-        python-pkgs.magic
-        python-pkgs.poppler-qt5
-        python-pkgs.pytesseract
-        python-pkgs.langdetect # unstructured[local-inference] ends
+        python-pkgs.pypdf
         python-pkgs.openai
-        python-pkgs.pydantic
         python-pkgs.python-dotenv
         python-pkgs.configargparse
         python-pkgs.streamlit
-        python-pkgs.lark
         python-pkgs.sentence-transformers
-        pkgs.unstructured
-        pkgs.poppler
-        pkgs.haskellPackages.iso639
+        python-pkgs.unstructured
       ]))
     ];
 
requirements.txt
CHANGED

@@ -7,5 +7,5 @@ streamlit
 python-dotenv
 sentence-transformers
 iso639-lang
-
-
+unstructured[pdf]
+pypdf
scripts.py
CHANGED

@@ -5,8 +5,11 @@ import json
 import openai
 import sys
 from dotenv import load_dotenv
+
 from langchain_community.document_loaders import TextLoader
+from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.document_loaders import UnstructuredPDFLoader
+
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Vectara
 from langchain_core.output_parsers import StrOutputParser

@@ -56,35 +59,35 @@ def get_sources(documents):
 def get_summary(documents):
     return documents[-1].page_content
 
-def ingest(file_path):
-
-
-
+def ingest(file_path):
+    try:
+        loader = PyPDFLoader(file_path)
+        documents = loader.load()
+        print('Loaded PyPDFLoader')
+    except Exception as e:
+        print(f'{e}')
         loader = UnstructuredPDFLoader(file_path)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    return docs
-
+        documents = loader.load()
+        print('Loaded UnstructuredPDFLoader')
+    finally:
+        # transform locally
+        documents = loader.load()
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0,
+            separators=[
+                "\n\n",
+                "\n",
+                " ",
+                ",",
+                "\uff0c",  # Fullwidth comma
+                "\u3001",  # Ideographic comma
+                "\uff0e",  # Fullwidth full stop
+                # "\u200B",  # Zero-width space (Asian languages)
+                # "\u3002",  # Ideographic full stop (Asian languages)
+                "",
+            ])
+        docs = text_splitter.split_documents(documents)
+
+        return docs
 
 
 def generate_metadata(docs):

@@ -126,8 +129,9 @@ def generate_metadata(docs):
             }
         ]
     )
+    return chat_completion.choices[0].message.content
 
-    return json.loads(chat_completion.choices[0].message.content)
+    #return json.loads(chat_completion.choices[0].message.content)
 
 
 def analyze_metadata(filename, description, discipline):