# Spaces: Sleeping
# Install necessary libraries
import os
import subprocess
import sys
# Install a package with pip, using the interpreter that runs this script
def install(package):
    """Install *package* by running ``pip install`` in a subprocess.

    pip is invoked as ``<current interpreter> -m pip`` so the package is
    installed for the same interpreter that executes this script.

    Args:
        package: Requirement string handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # sys.executable is the documented source of the interpreter path;
    # os.sys only resolves because the os module happens to import sys.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
# Ensure the necessary packages are installed (one pip run per package, in this order)
for _required_package in ("transformers", "torch", "pandas", "scikit-learn", "gradio"):
    install(_required_package)
import os | |
import pandas as pd | |
import gradio as gr | |
from transformers import AutoModel, AutoTokenizer | |
import torch | |
from sklearn.model_selection import train_test_split | |
# Load your dataset
def load_dataset(file_path="Valid-part-2.xlsx"):
    """Read the Excel dataset at *file_path* into a DataFrame.

    Args:
        file_path: Path of the Excel file to read. Defaults to
            "Valid-part-2.xlsx"; a relative path is resolved against the
            current working directory, which is printed to help debugging.

    Returns:
        The loaded DataFrame, or ``None`` when the file exists but reading
        it failed (the error is printed instead of raised).

    Raises:
        FileNotFoundError: If *file_path* does not exist.
    """
    print(f"Current working directory: {os.getcwd()}")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Dataset not found. Please ensure that '{file_path}' exists.")
    try:
        df = pd.read_excel(file_path)
    except Exception as e:
        # Deliberate best-effort: callers treat None as "dataset unavailable".
        print(f"Error loading dataset: {e}")
        return None
    print("Columns in the dataset:", df.columns.tolist())
    return df
# Preprocess the data
def preprocess_data(df):
    """Return *df* unchanged; placeholder hook for future preprocessing steps.

    Args:
        df: The dataset to preprocess.

    Returns:
        The very same object that was passed in.
    """
    # No transformations are applied yet; add cleaning steps here as needed.
    processed = df
    return processed
# Train your model
def train_model(df):
    """Placeholder training step: split *df* and return the pre-trained model.

    No fine-tuning happens here. The train/test split and the tokenizer are
    created but not used afterwards; the model is returned exactly as loaded.

    Args:
        df: Dataset handed to ``train_test_split``.

    Returns:
        The ``AutoModel`` loaded from "Alibaba-NLP/gte-multilingual-base".
    """
    checkpoint = "Alibaba-NLP/gte-multilingual-base"
    # Hold out 20% of the rows with a fixed random_state.
    train_part, test_part = train_test_split(df, random_state=42, test_size=0.2)
    # Load the tokenizer and the model from Hugging Face.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    pretrained = AutoModel.from_pretrained(checkpoint)
    # A real fine-tuning loop over the training split would go here.
    return pretrained
# Checkpoint used by the Gradio prediction function.
_PREDICT_CHECKPOINT = "Alibaba-NLP/gte-multilingual-base"
# Holds the (tokenizer, model) pair once it has been loaded, so requests
# after the first one do not reload the checkpoint.
_predict_resources = {}


def _get_predict_resources():
    """Return the (tokenizer, model) pair, loading it on first use only."""
    if "pair" not in _predict_resources:
        # NOTE(review): this checkpoint's model card loads it with
        # trust_remote_code=True -- confirm whether that flag is needed here.
        tokenizer = AutoTokenizer.from_pretrained(_PREDICT_CHECKPOINT)
        model = AutoModel.from_pretrained(_PREDICT_CHECKPOINT)
        _predict_resources["pair"] = (tokenizer, model)
    return _predict_resources["pair"]


# Define the Gradio interface function
def predict(input_text):
    """Run the pre-trained model on *input_text* and return its hidden states.

    Args:
        input_text: Text to tokenize and feed to the model.

    Returns:
        ``outputs.last_hidden_state`` from the model's forward pass.
    """
    tokenizer, model = _get_predict_resources()
    # Tokenize into PyTorch tensors.
    inputs = tokenizer(input_text, return_tensors="pt")
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    # Process the outputs as needed (e.g., extracting relevant information)
    return outputs.last_hidden_state
# Build the Gradio interface
def build_interface():
    """Load the dataset, run the placeholder pipeline and build the Gradio UI.

    Returns:
        A ``gr.Interface`` that wraps ``predict`` (two-line text box in,
        text out), or ``None`` when ``load_dataset`` returned ``None``.

    Raises:
        FileNotFoundError: Propagated from ``load_dataset`` when the dataset
            file is missing.
    """
    df = load_dataset()
    if df is None:
        return None
    df = preprocess_data(df)
    # predict loads its own model, so the value returned here is not kept.
    train_model(df)
    return gr.Interface(
        fn=predict,
        # The gr.inputs namespace was removed in Gradio 4; components are
        # exposed directly on the gradio module.
        inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
        outputs="text",
    )
# Run the Gradio interface when executed as a script
if __name__ == "__main__":
    iface = build_interface()
    if not iface:
        # build_interface returns None when the dataset could not be loaded.
        print("Failed to build the Gradio interface. Please check the dataset and model.")
    else:
        iface.launch()