# Install necessary libraries
import os
import subprocess
import sys
# Function to install a package if it is not already installed
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
# Ensure the necessary packages are installed
install("transformers")
install("torch")
install("pandas")
install("openpyxl")  # required by pandas.read_excel for .xlsx files
install("scikit-learn")
install("gradio")
import pandas as pd
import gradio as gr
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.model_selection import train_test_split

# google.colab is only available inside Colab; guard the import so the script
# can still run elsewhere (e.g. locally or in a Hugging Face Space)
try:
    from google.colab import files
except ImportError:
    files = None
# Upload the dataset if running in Google Colab
def upload_dataset():
    if files is None:
        raise RuntimeError("Interactive upload requires Google Colab; place the dataset at /content/ instead.")
    uploaded = files.upload()  # This will prompt the file upload
    file_name = list(uploaded.keys())[0]
    file_path = f'/content/{file_name}'
    return file_path
# Load your dataset
def load_dataset():
    file_path = '/content/Valid-part-2.xlsx'  # Default path if the file is uploaded manually to Colab
    # Check if the file exists
    if not os.path.exists(file_path):
        print(f"File not found at '{file_path}', prompting file upload...")
        file_path = upload_dataset()  # Upload if not found
    try:
        df = pd.read_excel(file_path)
        print("Columns in the dataset:", df.columns.tolist())
        return df
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None
# Preprocess the data
def preprocess_data(df):
    # Add your preprocessing steps here
    # For example: cleaning, tokenization, etc.
    return df
# Train your model
def train_model(df):
    # Split the dataset into training and testing sets
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    # Load your pre-trained model and tokenizer from Hugging Face
    # (this checkpoint ships custom modeling code, so trust_remote_code=True is typically required)
    tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base")
    model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
    # Add your training code here
    # This may involve tokenizing the data and feeding it into the model
    return model
# Define the Gradio interface function
def predict(input_text):
    # Load the model and tokenizer (reloaded on every call here; consider caching them for production use)
    tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base")
    model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
    # Tokenize input and make predictions
    inputs = tokenizer(input_text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # The base model returns hidden states rather than labels; return the mean-pooled
    # sentence embedding as a string so the Gradio "text" output can display it
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
    return str(embedding.tolist())
# Build the Gradio interface
def build_interface():
    df = load_dataset()  # Load your dataset
    if df is None:
        return None
    df = preprocess_data(df)  # Preprocess the dataset
    model = train_model(df)  # Train your model
    iface = gr.Interface(
        fn=predict,
        inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
        outputs="text"
    )
    return iface
# Run the Gradio interface
if __name__ == "__main__":
    iface = build_interface()
    if iface:
        iface.launch()
    else:
        print("Failed to build the Gradio interface. Please check the dataset and model.")