txtrct
Browse files
- app.py +3 -21
- requirements.txt +1 -3
app.py
CHANGED
|
@@ -2,12 +2,8 @@ import streamlit as st
|
|
| 2 |
import torch
|
| 3 |
import torch.nn.functional as F
|
| 4 |
from torch import Tensor
|
| 5 |
-
|
| 6 |
-
import tempfile
|
| 7 |
import textract
|
| 8 |
-
import docx2txt
|
| 9 |
-
import pdfplumber
|
| 10 |
-
import io
|
| 11 |
import os
|
| 12 |
|
| 13 |
def last_token_pool(last_hidden_states: Tensor,
|
|
@@ -46,23 +42,9 @@ click = st.button("Search")
|
|
| 46 |
|
| 47 |
|
| 48 |
|
| 49 |
-
def extract_text(doc):
|
| 50 |
-
if doc.type == 'text/plain':
|
| 51 |
-
return doc.read().decode('utf-8')
|
| 52 |
-
|
| 53 |
-
if doc.name.endswith(".pdf"):
|
| 54 |
-
docPath = save_upload(doc)
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
with pdfplumber.open(docPath) as pdf:
|
| 58 |
-
pages = [page.extract_text() for page in pdf.pages]
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
if doc.name.endswith('.docx'):
|
| 64 |
-
raw_text = doc.read()
|
| 65 |
-
return docx2txt.process(raw_text)
|
| 66 |
|
| 67 |
return None
|
| 68 |
|
|
|
|
| 2 |
import torch
|
| 3 |
import torch.nn.functional as F
|
| 4 |
from torch import Tensor
|
| 5 |
+
|
|
|
|
| 6 |
import textract
|
|
|
|
|
|
|
|
|
|
| 7 |
import os
|
| 8 |
|
| 9 |
def last_token_pool(last_hidden_states: Tensor,
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
+
def extract_text(doc):
|
| 47 |
+
return textract.process(doc).decode('utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
return None
|
| 50 |
|
requirements.txt
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
torch
|
| 2 |
transformers
|
| 3 |
-
textract
|
| 4 |
-
docx2txt
|
| 5 |
-
pdfplumber
|
|
|
|
| 1 |
torch
|
| 2 |
transformers
|
| 3 |
+
textract
|
|
|
|
|
|