Spaces:
Running
Running
File size: 588 Bytes
a4aec71 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
"""
Textractor module
"""
import requests
from bs4 import BeautifulSoup
from txtai.pipeline.segmentation import Segmentation
class Textractor(Segmentation):
    """
    Extracts text from web pages.

    Each input is treated as a URL: the page is downloaded with requests and its
    markup is stripped with BeautifulSoup. The sentences, lines, paragraphs,
    minlength and join options are passed through unchanged to the Segmentation
    base class.
    """

    # Seconds to wait for the server before giving up. requests applies no timeout
    # by default, so an unresponsive host would otherwise block forever.
    # Override on a subclass or instance to tune.
    TIMEOUT = 30

    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False):
        """
        Creates a new Textractor.

        Args:
            sentences: forwarded to Segmentation
            lines: forwarded to Segmentation
            paragraphs: forwarded to Segmentation
            minlength: forwarded to Segmentation
            join: forwarded to Segmentation
        """

        super().__init__(sentences, lines, paragraphs, minlength, join)

    def text(self, text):
        """
        Downloads the page at a url and returns its text content.

        Args:
            text: url of the page to retrieve

        Returns:
            text of the page with html markup removed

        Raises:
            requests.exceptions.Timeout: if the server does not respond within TIMEOUT seconds
            requests.exceptions.HTTPError: if the server responds with a 4xx or 5xx status
        """

        # text is a url
        response = requests.get(text, timeout=self.TIMEOUT)

        # Fail loudly on error responses rather than extracting text from an error page
        response.raise_for_status()

        soup = BeautifulSoup(response.text, features="html.parser")
        return soup.get_text()