# Engineer / app.py
# atifsial123 · "Update app.py" · commit 01df9cf (verified) · 3.24 kB
# Install necessary libraries
import os
import subprocess
# Install a package with pip (pip is always invoked; it skips requirements that are already satisfied)
def install(package):
    """Install ``package`` into the running interpreter's environment with pip.

    Args:
        package: Requirement string handed verbatim to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If the pip process exits with a
            non-zero status.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import list.
    import sys

    # ``sys.executable`` is the documented handle on the current interpreter;
    # ``os.sys`` only works because ``os`` happens to import ``sys`` internally.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
# Ensure the necessary packages are installed
# One pip invocation per runtime dependency, executed in this order.
for _requirement in ("transformers", "torch", "pandas", "scikit-learn", "gradio"):
    install(_requirement)
import os
import pandas as pd
import gradio as gr
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.model_selection import train_test_split
from google.colab import files
# Upload the dataset if running in Google Colab
def upload_dataset():
    """Prompt for a file upload and return the path of the first uploaded file.

    Returns:
        The string ``/content/<name>``, where ``<name>`` is the first key of
        the mapping returned by ``files.upload()``.

    Raises:
        IndexError: If the upload result contains no entries.
    """
    # files.upload() opens the interactive upload prompt; the result is
    # treated as a mapping keyed by file name.
    uploaded_files = files.upload()
    first_name = [*uploaded_files.keys()][0]
    return f'/content/{first_name}'
# Load your dataset
def load_dataset(file_path='/content/Valid-part-2.xlsx'):
    """Read the dataset spreadsheet into a DataFrame.

    Args:
        file_path: Path of the Excel file to read. Defaults to the location
            used when the file is uploaded manually to Colab. If nothing
            exists at this path, an interactive upload is requested through
            ``upload_dataset()`` and the uploaded file is read instead.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` if reading the file
        fails for any reason (the error is printed).
    """
    if not os.path.exists(file_path):
        print(f"File not found at '{file_path}', prompting file upload...")
        file_path = upload_dataset()  # Upload if not found

    # Best-effort load: any failure is reported and signalled with None so
    # the caller can decide how to proceed.
    try:
        df = pd.read_excel(file_path)
        print("Columns in the dataset:", df.columns.tolist())
        return df
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None
# Preprocess the data
def preprocess_data(df):
    """Return ``df`` as received; placeholder hook for preprocessing.

    Cleaning, tokenization and similar steps are meant to be added here.
    """
    processed = df
    return processed
# Train your model
def train_model(df):
    """Split ``df`` and load the pretrained checkpoint; no training is run yet.

    Args:
        df: Dataset passed to ``train_test_split``.

    Returns:
        The ``AutoModel`` loaded from the pretrained checkpoint.
    """
    checkpoint = "Alibaba-NLP/gte-multilingual-base"

    # 80/20 split with a fixed seed; the splits are not consumed yet because
    # the training loop is still a placeholder.
    train_part, test_part = train_test_split(df, test_size=0.2, random_state=42)

    # Tokenizer and model are both fetched from the Hugging Face checkpoint.
    hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    hf_model = AutoModel.from_pretrained(checkpoint)

    # Training code (tokenizing the data and feeding it to the model) goes here.
    return hf_model
# Define the Gradio interface function
# Tokenizer/model pairs already loaded for predict(), keyed by checkpoint name.
_PREDICT_RESOURCES = {}


def _get_predict_resources(checkpoint):
    """Return the ``(tokenizer, model)`` pair for ``checkpoint``, loading it once."""
    if checkpoint not in _PREDICT_RESOURCES:
        # NOTE(review): the model card for this checkpoint appears to require
        # trust_remote_code=True when loading — confirm before deployment.
        _PREDICT_RESOURCES[checkpoint] = (
            AutoTokenizer.from_pretrained(checkpoint),
            AutoModel.from_pretrained(checkpoint),
        )
    return _PREDICT_RESOURCES[checkpoint]


def predict(input_text):
    """Run the pretrained model on ``input_text`` and return its hidden states.

    The tokenizer and model are loaded on the first call and reused on later
    calls, instead of being reloaded for every request.

    Args:
        input_text: Text passed to the tokenizer.

    Returns:
        ``outputs.last_hidden_state`` from the model's forward pass.
    """
    tokenizer, model = _get_predict_resources("Alibaba-NLP/gte-multilingual-base")

    # Tokenize into PyTorch tensors and run inference without tracking gradients.
    inputs = tokenizer(input_text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Process the outputs as needed (e.g., extracting relevant information)
    return outputs.last_hidden_state
# Build the Gradio interface
def build_interface():
    """Load the dataset, run the training step and assemble the Gradio UI.

    Returns:
        A ``gr.Interface`` that routes a text box to ``predict``, or ``None``
        when the dataset could not be loaded.
    """
    df = load_dataset()
    if df is None:
        return None

    df = preprocess_data(df)
    # The returned model is not kept: predict() loads the checkpoint itself.
    train_model(df)

    return gr.Interface(
        fn=predict,
        # Use the top-level component; the legacy ``gr.inputs`` namespace is
        # not available in current Gradio releases.
        inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
        outputs="text",
    )
# Run the Gradio interface
if __name__ == "__main__":
    # Script entry point: build the UI and serve it, or report why it could not start.
    iface = build_interface()
    if not iface:
        print("Failed to build the Gradio interface. Please check the dataset and model.")
    else:
        iface.launch()