import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse
import gradio as gr


def extract_wikipedia_text(raw_text, language):
    contents = []
    paragraph = ""
    for element in raw_text:
        # a section headline marks the end of the current paragraph block
        if element.name == "span":
            if paragraph == "":
                continue
            contents.append({f"text-{language}": paragraph})
            paragraph = ""
        else:
            clean_text = preprocessing(element.text)
            if clean_text == "":
                continue
            if paragraph != "":
                clean_text = " " + clean_text
            paragraph += clean_text
    # keep any text collected after the last headline
    if paragraph != "":
        contents.append({f"text-{language}": paragraph})
    return contents


def preprocessing(text):
    # remove square-bracketed citation markers, e.g. [1]
    clean_text = re.sub(r"\[.*?\]", "", text).strip()
    # remove newlines
    clean_text = clean_text.replace("\n", "")
    return clean_text


def scrape(url):
    # the language code is the Wikipedia subdomain, e.g. "en" for en.wikipedia.org
    language = urlparse(url).netloc.split(".")[0]
    try:
        page = requests.get(url, headers={"user-agent": "Mozilla/5.0"})
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")
    except requests.RequestException as error:
        return {"source": url, "error": str(error)}
    title = soup.find("h1", {"id": "firstHeading"}).get_text().strip()
    # grab section headlines and paragraphs in document order
    raw_text = soup.select(
        "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p"
    )
    contents = extract_wikipedia_text(raw_text, language)
    json_output = {"source": url, f"title-{language}": title, "pages": contents}
    return json_output


with gr.Blocks() as demo:
    gr.Markdown(
        """
        <center>
        <h1>Wikipedia Scraper 📜</h1>
        </center>
        """
    )
    with gr.Row():
        inp = gr.Textbox(placeholder="Wikipedia URL")
        out = gr.JSON()
    btn = gr.Button("Scrape")
    btn.click(fn=scrape, inputs=inp, outputs=out)

demo.launch()
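
# A minimal sketch of calling scrape() directly, without the Gradio UI; the URL
# below is only an illustrative example, not one referenced by this Space:
#
#   result = scrape("https://en.wikipedia.org/wiki/Web_scraping")
#   print(result["title-en"])    # title keyed by the "en" subdomain
#   print(len(result["pages"]))  # number of extracted text sections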