File size: 3,384 Bytes
ced22e1
 
1646c3f
ced22e1
f69939b
4c32405
ced22e1
 
b08284c
1646c3f
95a596a
b08284c
95a596a
 
 
 
b08284c
95a596a
1646c3f
b08284c
 
4e37c20
b08284c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e37c20
a00ff89
 
b08284c
 
1646c3f
b08284c
 
 
 
 
1646c3f
b08284c
 
 
 
1646c3f
b08284c
 
 
865820b
b08284c
 
 
4e37c20
b08284c
 
 
d4eaaae
902d69b
b08284c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import streamlit as st
from transformers import pipeline
import re

# NOTE(review): the original code built a transformers zero-shot-classification
# pipeline here ("cross-encoder/nli-distilroberta-base"), but `classifier` was
# never used anywhere in this file.  The heavyweight model download/load is
# removed as dead code; restore it only if classification is actually wired
# into the URL analysis below.

# App title.
st.title("URL Analysis App")

# Input: a plain-text file with one URL per line.
uploaded_file = st.file_uploader("Upload a text file containing URLs", type=["txt"])

if uploaded_file is not None:
    # Decode the uploaded file as UTF-8 and keep every non-blank line as a URL.
    raw_text = uploaded_file.read().decode("utf-8")
    urls = []
    for raw_line in raw_text.splitlines():
        candidate = raw_line.strip()
        if candidate:
            urls.append(candidate)

    # Result accumulators, filled in by analyze_urls below.
    parameters, domains = [], []
    full_page_types, file_extensions = [], []

    # دالة تحليل الروابط
    def analyze_urls(urls):
        """Extract parameters, TLDs, page types and file extensions from *urls*.

        Appends (without duplicates, except for ``parameters``) into the
        ``parameters``, ``domains``, ``full_page_types`` and
        ``file_extensions`` lists defined in the enclosing scope.
        """
        for url in urls:
            # Query-string parameter names: every "name=" occurrence.
            parameters.extend(re.findall(r'(\w+)=', url))

            # Top-level domain (.com, .uk, .au): the dotted label immediately
            # before a path, port, query, fragment, or the end of the URL.
            # (The previous pattern r'\.([a-zA-Z]+)' grabbed the FIRST dotted
            # label, e.g. "example" from "www.example.com".)
            tld_match = re.search(r'\.([a-zA-Z]+)(?=[/:?#]|$)', url)
            if tld_match:
                tld = tld_match.group(1)
                if tld not in domains:
                    domains.append(tld)

            # Full page type (e.g. product_detail.php?, viewtopic.php?):
            # filename directly followed by a query string.
            page_match = re.search(r'(\w+\.[a-z]+)\?', url)
            if page_match:
                page_type = page_match.group(1)
                if page_type not in full_page_types:
                    full_page_types.append(page_type)

            # Script/file extension only (php, phtml, asp), taken from the last
            # path segment, excluding the query string.  The lookbehind skips
            # the "//" of the scheme so a bare host like "https://example.com"
            # no longer yields a bogus "extension".
            ext_match = re.search(r'(?<!/)/[^/?]*\.(\w+)(?:\?|$)', url)
            if ext_match:
                extension = ext_match.group(1)
                if extension not in file_extensions:
                    file_extensions.append(extension)

    # Run the analysis only when the user clicks Start; on other reruns the
    # accumulator lists stay empty and the sections below render blank.
    if st.button("Start"):
        analyze_urls(urls)

    # De-duplicate while preserving first-seen order.  The previous
    # list(set(...)) shuffled the output nondeterministically on every rerun.
    parameters = list(dict.fromkeys(parameters))
    domains = list(dict.fromkeys(domains))
    full_page_types = list(dict.fromkeys(full_page_types))
    file_extensions = list(dict.fromkeys(file_extensions))

    # عرض النتائج
    st.header("Parameters")
    st.text_area("Copy the parameters here:", value="\n".join(parameters), height=200, key="parameters")
    st.button("Copy Parameters", on_click=lambda: st.clipboard.copy("\n".join(parameters)))

    st.header("Domains")
    st.text_area("Copy the domains here:", value="\n".join(domains), height=200, key="domains")
    st.button("Copy Domains", on_click=lambda: st.clipboard.copy("\n".join(domains)))

    st.header("Full PageType")
    st.text_area("Copy the full page types here:", value="\n".join(full_page_types), height=200, key="full_page_types")
    st.button("Copy Full PageTypes", on_click=lambda: st.clipboard.copy("\n".join(full_page_types)))

    st.header("File Extensions")
    st.text_area("Copy the file extensions here:", value="\n".join(file_extensions), height=200, key="file_extensions")
    st.button("Copy File Extensions", on_click=lambda: st.clipboard.copy("\n".join(file_extensions)))

else:
    st.warning("Please upload a text file containing URLs to start analysis.")