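# Spaces: Sleeping
# NOTE(review): the line above appears to be page-header residue from the
# hosting site's file viewer, not part of the program; kept only as a comment.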
import asyncio
import functools
import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from catboost import CatBoostClassifier  # Import CatBoost
from flask import Flask, request, render_template

from url_process import extract_url_features  # Ensure you have the appropriate feature extraction function
# Batch processing: hand URLs downstream in manageable chunks
def process_urls_in_batches(urls, batch_size=10):
    """Yield consecutive slices of ``urls``, each at most ``batch_size`` long.

    Args:
        urls: A sequence supporting ``len()`` and slicing (e.g. a list of
            URL strings).
        batch_size: Maximum number of items per yielded slice.

    Yields:
        ``urls[start:start + batch_size]`` for each successive start offset.
        The final slice may be shorter than ``batch_size``.
    """
    offsets = range(0, len(urls), batch_size)
    yield from (urls[start:start + batch_size] for start in offsets)
# Async wrapper so feature extraction does not block the event loop
async def async_extract_features(url):
    """Run ``extract_url_features(url)`` in a worker thread and return its result.

    The extraction function is a regular (blocking) callable; presumably it
    performs DNS lookups and HTTP requests -- confirm against ``url_process``.
    Off-loading it with ``asyncio.to_thread`` keeps the calling coroutine's
    event loop free while the extraction runs.

    Args:
        url: The URL to analyse; passed through unchanged.

    Returns:
        Whatever ``extract_url_features`` returns for ``url``.
    """
    return await asyncio.to_thread(extract_url_features, url)
# Thread pool for extracting features from several URLs concurrently
def extract_features_in_parallel(urls, max_workers=5):
    """Extract features for every URL in ``urls`` using a pool of threads.

    Threads only overlap usefully while the work is waiting on I/O (or
    otherwise releases the GIL); they do not speed up pure-Python CPU-bound
    work. NOTE(review): extraction is presumably network-bound (DNS/HTTP) --
    confirm against ``url_process``.

    Args:
        urls: Iterable of URLs to process.
        max_workers: Size of the thread pool. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        A list with one ``extract_url_features`` result per URL, in the same
        order as ``urls``.

    Raises:
        Any exception raised by ``extract_url_features`` propagates when the
        results are collected.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(extract_url_features, urls))
# Load the CatBoost model (cached) and run inference
@functools.lru_cache(maxsize=4)
def _load_catboost_model(model_path, mtime_ns, size):
    """Load a CatBoost model from ``model_path``, memoized per file version.

    ``mtime_ns`` and ``size`` are not needed to load the model; they are part
    of the cache key so that replacing the file on disk triggers a reload
    instead of serving a stale model.
    """
    print(f"Attempting to load model from: {model_path}")
    print(f"File exists: {os.path.exists(model_path)}")
    print(f"File size: {size}")
    model = CatBoostClassifier()
    model.load_model(model_path)
    return model


def predict_with_catboost(features_df, model_path):
    """Return CatBoost predictions for ``features_df``.

    The model file is deserialized only the first time a given file version
    (path + modification time + size) is seen; later calls reuse the loaded
    model instead of re-reading it from disk on every call.

    NOTE(review): the cached model instance is shared by all callers --
    confirm that concurrent ``predict`` calls are acceptable for the
    deployment.

    Args:
        features_df: Feature rows to score, passed to ``model.predict``.
        model_path: Filesystem path of the saved CatBoost model.

    Returns:
        The value returned by ``CatBoostClassifier.predict``.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
        Exception: Any error raised while loading the model or predicting is
            printed and re-raised unchanged.
    """
    try:
        # os.stat both validates that the file exists and supplies the
        # cache-key fields in a single system call.
        file_info = os.stat(model_path)
        model = _load_catboost_model(
            model_path, file_info.st_mtime_ns, file_info.st_size
        )
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise

    try:
        return model.predict(features_df)
    except Exception as e:
        # Reported separately so prediction failures are not mislabelled as
        # model-loading failures.
        print(f"Error during prediction: {str(e)}")
        raise
# Flask app setup
app = Flask(__name__)


def _find_model_path():
    """Return the first candidate location at which the model file exists.

    Raises:
        FileNotFoundError: If none of the candidate paths exists.
    """
    candidates = (
        os.path.join(os.getcwd(), "catboost_model.bin"),
        "/app/catboost_model.bin",  # Docker container path
        "catboost_model.bin",
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError("Model file not found in any expected location")


# The view must be registered with the app, and must accept POST because the
# body branches on ``request.method == "POST"``; without a route every
# request would return 404.
# NOTE(review): async views need Flask's async extra (``flask[async]``) --
# confirm it is installed in the deployment image.
@app.route("/", methods=["GET", "POST"])
async def index():
    """Render the URL-check page and, on POST, classify the submitted URL.

    On GET the template is rendered with no result. On POST the ``url`` form
    field is read, its features are extracted in a worker thread, and the
    CatBoost model's prediction is mapped to "Malicious" (prediction == 1) or
    "Legitimate" (anything else). Any exception raised while processing is
    shown to the user as an error message rather than propagating.

    Returns:
        The rendered ``index.html`` template, given ``result`` (verdict or
        error text, ``None`` on GET) and ``url_features`` (the extracted
        features, ``None`` on GET or on error).
    """
    result = None
    url_features = None

    if request.method == "POST":
        # Get the URL input from the form
        url = request.form["url"]
        try:
            # Extract features without blocking the event loop
            features = await async_extract_features(url)
            # The model expects tabular input: one row for this URL
            features_df = pd.DataFrame([features])

            predictions = predict_with_catboost(features_df, _find_model_path())

            # Determine if the URL is malicious or legitimate
            result = "Malicious" if predictions[0] == 1 else "Legitimate"
            # Expose the extracted features to the template only on success
            url_features = features
        except Exception as e:
            result = f"Error processing URL: {str(e)}"

    return render_template("index.html", result=result, url_features=url_features)
if __name__ == "__main__":
    # Listen on all interfaces, port 7860, with the debugger disabled.
    app.run(host="0.0.0.0", port=7860, debug=False)