|
import gradio as gr |
|
import urllib.request |
|
import requests |
|
import bs4 |
|
import lxml |
|
|
|
def find_all(url, q=None, num=None):
    """Fetch *url* and return every tag on the page as a list of dicts.

    Each entry maps the tag's name to its text content and, when the tag
    has a named parent, includes a ``"parent"`` key with the parent's name.

    Args:
        url: Address passed directly to ``urllib.request.urlopen``.
        q, num: Unused; accepted so the gradio ``click`` wiring can pass
            the query/number inputs with the same signature as ``find_it``.

    Returns:
        list[dict]: one ``{tag_name: text, "parent": parent_name}`` dict
        per tag found in the document.
    """
    rawp = []
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, 'lxml')

    # Debug output. Guarded: soup.title is None on pages with no <title>,
    # and the original unguarded attribute access crashed with AttributeError.
    if soup.title is not None:
        print(soup.title)
        print(soup.title.name)
        print(soup.title.string)
        print(soup.title.parent.name)

    for tag in soup.find_all():
        try:
            rawp.append({tag.name: tag.text, "parent": tag.parent.name})
        except AttributeError as e:
            # Top-level tags (e.g. <html>) may have no named parent;
            # fall back to recording the tag without a "parent" key.
            print(e)
            rawp.append({tag.name: tag.text})
    return rawp
|
|
|
|
|
def find_it(url, q=None, num=None):
    """Fetch *url* and collect text of every tag whose name matches *q*.

    Also prints each matching tag's children and every anchor href on the
    page (debug output preserved from the original implementation).

    Args:
        url: Address passed directly to ``urllib.request.urlopen``.
        q: Tag name to search for (e.g. ``"p"``); coerced to ``str`` so a
            non-string value behaves like the original f-string formatting.
        num: Unused; kept for signature compatibility with the UI wiring.

    Returns:
        list: one ``[{q: tag_string, "parent": grandparent_name}]`` entry
        per matching tag; ``"parent"`` is None near the document root.
    """
    out = []
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, 'lxml')

    for p in soup.find_all(str(q)):
        print(p.findChildren())
        # The original dereferenced p.parent.parent.name unconditionally,
        # which raises AttributeError when the grandparent is missing.
        grandparent = p.parent.parent if p.parent is not None else None
        out.append([{q: p.string,
                     "parent": grandparent.name if grandparent is not None else None}])

    # NOTE: renamed from `url` — the original loop variable shadowed the
    # function parameter.
    for link in soup.find_all('a'):
        print(link.get('href'))

    return out
|
|
|
def find_it2(url):
    """Fetch *url* with requests and return the page's anchor texts.

    Args:
        url: Address fetched with ``requests.get``.

    Returns:
        str: ``"URL Links:"`` header followed by one anchor's text per
        line, or the caught exception object if anything fails.
    """
    try:
        # Original call passed invalid keyword args (a1/q2/q3) which made
        # every call raise TypeError; it also sat outside the try block,
        # so connection errors escaped the handler.
        response = requests.get(url)
        response.raise_for_status()
        # Original referenced bare `BeautifulSoup`, a NameError — only the
        # `bs4` module is imported at the top of the file.
        soup = bs4.BeautifulSoup(response.content, 'lxml')
        # Prefix the header once; the original used it as a join separator,
        # repeating "URL Links:" between every pair of links.
        out = 'URL Links:\n' + '\n'.join(p.text for p in soup.find_all('a'))
        return out
    except Exception as e:
        # Best-effort contract preserved: log and hand the exception back
        # to the caller instead of raising.
        print(e)
        return e
|
|
|
|
|
# Gradio UI: a URL box plus query/count inputs, two action buttons, and two
# JSON panels showing the raw tag dump ("Load") and the filtered matches
# ("Find").
with gr.Blocks() as demo:
    with gr.Row():
        url_box = gr.Textbox()
        query_box = gr.Textbox(value="p")
        count_box = gr.Number(value=1)
    with gr.Row():
        load_button = gr.Button("Load")
        find_button = gr.Button("Find")
    with gr.Row():
        raw_panel = gr.JSON()
        result_panel = gr.JSON()

    # "Load" dumps every tag on the page; "Find" filters by the query tag.
    load_button.click(find_all, [url_box, query_box, count_box], [raw_panel])
    find_button.click(find_it, [url_box, query_box, count_box], [result_panel])

demo.launch()
|
|
|
|