# Spaces:
# Sleeping
# Sleeping
# Install necessary libraries
import os | |
import subprocess | |
# Function to install a package if it is not already installed
def install(package, import_name=None):
    """Install ``package`` with pip unless its module can already be imported.

    Args:
        package: Requirement string handed to ``pip install``.
        import_name: Module name to probe for an existing installation when
            it differs from the distribution name (for example ``"sklearn"``
            for ``"scikit-learn"``). Defaults to ``package``.

    Raises:
        subprocess.CalledProcessError: If the pip invocation exits non-zero.
    """
    # Function-scope imports keep this block self-contained.
    import importlib.util
    import sys

    module_name = import_name or package
    try:
        already_installed = importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # Names that cannot be probed as a module path (version pins, dotted
        # names whose parent is missing) are left for pip to resolve.
        already_installed = False

    if already_installed:
        return

    # sys.executable is the interpreter running this script, so pip installs
    # into the same environment the imports below will use.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
# Ensure the necessary packages are installed (order matches the imports below)
for _required_package in (
    "transformers",
    "torch",
    "pandas",
    "scikit-learn",
    "gradio",
):
    install(_required_package)
import os | |
import pandas as pd | |
import gradio as gr | |
from transformers import AutoModel, AutoTokenizer | |
import torch | |
from sklearn.model_selection import train_test_split | |
from google.colab import files | |
# Upload the dataset if running in Google Colab
def upload_dataset():
    """Prompt for a file upload through Colab and return the uploaded file's path.

    Returns:
        The ``/content/<name>`` path built from the first uploaded file name.

    Raises:
        FileNotFoundError: If the upload finished without any file.
    """
    uploaded = files.upload()  # This will prompt the file upload
    if not uploaded:
        # Previously this surfaced as a bare IndexError from list(...)[0].
        raise FileNotFoundError("No file was uploaded.")
    # Only the first uploaded file is used; any others are ignored.
    file_name = next(iter(uploaded))
    return f'/content/{file_name}'
# Load your dataset
def load_dataset(file_path='/content/Valid-part-2.xlsx'):
    """Read the Excel dataset into a DataFrame.

    Args:
        file_path: Location of the workbook. Defaults to the path used when
            the file is uploaded manually to Colab.

    Returns:
        The loaded ``DataFrame``, or ``None`` when the file could not be read
        (the error is printed).
    """
    # Check if the file exists
    if not os.path.exists(file_path):
        print(f"File not found at '{file_path}', prompting file upload...")
        file_path = upload_dataset()  # Upload if not found

    # Only the read itself is guarded; any failure there is reported and
    # turned into a None result so the caller can bail out cleanly.
    try:
        df = pd.read_excel(file_path)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

    print("Columns in the dataset:", df.columns.tolist())
    return df
# Preprocess the data
def preprocess_data(df):
    """Return the dataset ready for training.

    No preprocessing is implemented yet, so ``df`` is handed back untouched.
    """
    # Placeholder: cleaning, tokenization, etc. would be applied here.
    prepared = df
    return prepared
# Train your model
def train_model(df, model_name="Alibaba-NLP/gte-multilingual-base"):
    """Split the dataset and load the pre-trained model to be trained.

    Training itself is not implemented yet; the function currently returns
    the pre-trained model as loaded.

    Args:
        df: Dataset to split into training and testing portions.
        model_name: Hugging Face model id to load. Defaults to the
            multilingual GTE base model used throughout this script.

    Returns:
        The model loaded by ``AutoModel.from_pretrained``.
    """
    # Split the dataset into training and testing sets.
    # The splits are not consumed until the training code below is written.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Load your pre-trained model and tokenizer from Hugging Face.
    # NOTE: this checkpoint ships its own modeling code, so loading needs
    # trust_remote_code=True. That executes code from the model repository;
    # pin a revision if the repository is not fully trusted.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

    # Add your training code here
    # This may involve tokenizing the data and feeding it into the model
    return model
# Define the Gradio interface function
_PREDICT_MODEL_NAME = "Alibaba-NLP/gte-multilingual-base"
_predict_resources = {}


def _load_predict_resources():
    """Load the tokenizer and model on first use and reuse them afterwards."""
    if not _predict_resources:
        # NOTE: trust_remote_code=True executes modeling code shipped in the
        # model repository; this checkpoint requires it to load.
        tokenizer = AutoTokenizer.from_pretrained(
            _PREDICT_MODEL_NAME, trust_remote_code=True
        )
        model = AutoModel.from_pretrained(
            _PREDICT_MODEL_NAME, trust_remote_code=True
        )
        # Store both only after both loaded, so a failed load is retried
        # on the next call instead of leaving a half-filled cache.
        _predict_resources["tokenizer"] = tokenizer
        _predict_resources["model"] = model
    return _predict_resources["tokenizer"], _predict_resources["model"]


def predict(input_text):
    """Run the model on ``input_text`` and return its last hidden state.

    Args:
        input_text: Text entered in the Gradio textbox.

    Returns:
        ``outputs.last_hidden_state`` from the model's forward pass.
    """
    # Reuse the cached tokenizer/model rather than reloading per request.
    tokenizer, model = _load_predict_resources()

    # Tokenize input and make predictions
    inputs = tokenizer(input_text, return_tensors="pt")
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)

    # Process the outputs as needed (e.g., extracting relevant information)
    return outputs.last_hidden_state
# Build the Gradio interface
def build_interface():
    """Load the data, run the training step and assemble the Gradio UI.

    Returns:
        The ``gr.Interface`` wired to ``predict``, or ``None`` when the
        dataset could not be loaded.
    """
    df = load_dataset()  # Load your dataset
    if df is None:
        return None

    df = preprocess_data(df)  # Preprocess the dataset
    # The returned model is not used here; predict() loads its own copy.
    train_model(df)  # Train your model

    # Current Gradio exposes components at the top level; the gr.inputs
    # namespace only exists in old releases, so use it purely as a fallback.
    textbox_cls = getattr(gr, "Textbox", None)
    if textbox_cls is None:
        textbox_cls = gr.inputs.Textbox

    iface = gr.Interface(
        fn=predict,
        inputs=textbox_cls(lines=2, placeholder="Enter text here..."),
        outputs="text",
    )
    return iface
# Run the Gradio interface when executed as a script
if __name__ == "__main__":
    iface = build_interface()
    # Handle the failure case first; otherwise start the app.
    if not iface:
        print("Failed to build the Gradio interface. Please check the dataset and model.")
    else:
        iface.launch()