Spaces:

ruslanmv
/

Hotel-Recommendation-Multimodal

Paused

App Files Files Community

ruslanmv committed on Aug 14, 2024

Commit

c28812c

1 Parent(s): 43ba7b5

update

Browse files

Files changed (2) hide show

backend.py +79 -243
requirements.txt +1 -1

backend.py CHANGED Viewed

@@ -1,3 +1,13 @@
 import os
 import pandas as pd
 import requests
@@ -5,183 +15,107 @@ from PIL import Image, UnidentifiedImageError
 from io import BytesIO
 import matplotlib.pyplot as plt
 import urllib3
-from transformers import pipeline
-from transformers import BitsAndBytesConfig
 import torch
 import textwrap
-import pandas as pd
-import numpy as np
-from haversine import haversine  # Install haversine library: pip install haversine
-from transformers import AutoProcessor, LlavaForConditionalGeneration
-from transformers import BitsAndBytesConfig
-import torch
 from huggingface_hub import InferenceClient
-IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
-IS_SPACE = os.environ.get("SPACE_ID", None) is not None
-device = "cuda" if torch.cuda.is_available() else "cpu"
 LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
-print(f"Using device: {device}")
-print(f"low memory: {LOW_MEMORY}")
-# Define BitsAndBytesConfig
-# Ensure model is on the correct device
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.float16
 )
-model_id = "llava-hf/llava-1.5-7b-hf"
-processor = AutoProcessor.from_pretrained(model_id)
-model = LlavaForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")
-model.to(device)
-import os
-import requests
-url = 'https://github.com/ruslanmv/watsonx-with-multimodal-llava/raw/master/geocoded_hotels.csv'
-filename = 'geocoded_hotels.csv'
-# Check if the file already exists
-if not os.path.isfile(filename):
     response = requests.get(url)
     if response.status_code == 200:
-        with open(filename, 'wb') as f:
             f.write(response.content)
-        print(f"File {filename} downloaded successfully!")
     else:
         print(f"Error downloading file. Status code: {response.status_code}")
 else:
-    print(f"File {filename} already exists.")
-import os
-import pandas as pd
-from datasets import load_dataset
-import pyarrow
-# 1. Get the Current Directory
-current_directory = os.getcwd()
-# 2. Construct the Full Path to the CSV File
-csv_file_path = os.path.join(current_directory, 'hotel_multimodal.csv')
-# 3. Check if the file exists
 if not os.path.exists(csv_file_path):
-    # If not, download the dataset
-    print("File not found, downloading from Hugging Face...")
     dataset = load_dataset("ruslanmv/hotel-multimodal")
-    # Convert the 'train' dataset to a DataFrame using .to_pandas()
     df_hotels = dataset['train'].to_pandas()
-    # 4.Save to CSV
     df_hotels.to_csv(csv_file_path, index=False)
     print("Dataset downloaded and saved as CSV.")
-# 5. Read the CSV file
-df_hotels = pd.read_csv(csv_file_path)
-print("DataFrame loaded:")
-geocoded_hotels_path = os.path.join(current_directory, 'geocoded_hotels.csv')
-# Read the CSV file
-geocoded_hotels = pd.read_csv(geocoded_hotels_path)
-import requests
 def get_current_location():
     try:
         response = requests.get('https://ipinfo.io/json')
         data = response.json()
         location = data.get('loc', '')
         if location:
-            latitude, longitude = map(float, location.split(','))
-            return latitude, longitude
         else:
             return None, None
     except Exception as e:
         print(f"An error occurred: {e}")
         return None, None
-latitude, longitude = get_current_location()
-if latitude and longitude:
-    print(f"Current location: Latitude = {latitude}, Longitude = {longitude}")
-else:
-    print("Could not retrieve the current location.")
-from geopy.geocoders import Nominatim
 def get_coordinates(location_name):
-    """Fetches latitude and longitude coordinates for a given location name.
-    Args:
-        location_name (str): The name of the location (e.g., "Rome, Italy").
-    Returns:
-        tuple: A tuple containing the latitude and longitude (float values),
-               or None if the location is not found.
-    """
     geolocator = Nominatim(user_agent="coordinate_finder")
     location = geolocator.geocode(location_name)
     if location:
         return location.latitude, location.longitude
     else:
-        return None  # Location not found
 def find_nearby(place=None):
-    if place!=None:
         coordinates = get_coordinates(place)
         if coordinates:
             latitude, longitude = coordinates
             print(f"The coordinates of {place} are: Latitude: {latitude}, Longitude: {longitude}")
         else:
             print(f"Location not found: {place}")
     else:
         latitude, longitude = get_current_location()
-        if latitude and longitude:
-            print(f"Current location: Latitude = {latitude}, Longitude = {longitude}")
-    # Load the geocoded_hotels DataFrame
-    current_directory = os.getcwd()
-    geocoded_hotels_path = os.path.join(current_directory, 'geocoded_hotels.csv')
-    geocoded_hotels = pd.read_csv(geocoded_hotels_path)
-    # Define input coordinates for the reference location
-    reference_latitude = latitude
-    reference_longitude = longitude
-    # Haversine Distance Function
-    def calculate_haversine_distance(lat1, lon1, lat2, lon2):
-        """Calculates the Haversine distance between two points on the Earth's surface."""
-        return haversine((lat1, lon1), (lat2, lon2))
-    # Calculate distances to all other points in the DataFrame
     geocoded_hotels['distance_km'] = geocoded_hotels.apply(
-        lambda row: calculate_haversine_distance(
-            reference_latitude, reference_longitude, row['latitude'], row['longitude']
-        ),
         axis=1
     )
-    # Sort by distance and get the top 5 closest points
     closest_hotels = geocoded_hotels.sort_values(by='distance_km').head(5)
-    # Display the results
     print("The 5 closest locations are:\n")
     print(closest_hotels)
     return closest_hotels
@@ -189,60 +123,16 @@ def find_nearby(place=None):
 @spaces.GPU
 # Define the respond function
 def search_hotel(place=None):
-    import os
-    import pandas as pd
-    import requests
-    from PIL import Image, UnidentifiedImageError
-    from io import BytesIO
-    import urllib3
-    from transformers import pipeline
-    from transformers import BitsAndBytesConfig
-    import torch
-    # Suppress the InsecureRequestWarning
-    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-    # 1. Get the Current Directory
-    current_directory = os.getcwd()
-    # 2. Construct the Full Path to the CSV File
-    csv_file_path = os.path.join(current_directory, 'hotel_multimodal.csv')
-    # Read the CSV file
-    df_hotels = pd.read_csv(csv_file_path)
-    geocoded_hotels_path = os.path.join(current_directory, 'geocoded_hotels.csv')
-    # Read the CSV file
-    geocoded_hotels = pd.read_csv(geocoded_hotels_path)
-    # Assuming find_nearby function is defined elsewhere
     df_found = find_nearby(place)
-    # Converting df_found[["hotel_id"]].values to a list
     hotel_ids = df_found["hotel_id"].values.tolist()
-    # Extracting rows from df_hotels where hotel_id is in the list hotel_ids
     filtered_df = df_hotels[df_hotels['hotel_id'].isin(hotel_ids)]
-    # Ordering filtered_df by the order of hotel_ids
     filtered_df['hotel_id'] = pd.Categorical(filtered_df['hotel_id'], categories=hotel_ids, ordered=True)
     filtered_df = filtered_df.sort_values('hotel_id').reset_index(drop=True)
-    # Define the quantization config and model ID
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.float16
-    )
-    model_id = "llava-hf/llava-1.5-7b-hf"
-    # Initialize the pipeline
-    pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})
-    # Group by hotel_id and take the first 2 image URLs for each hotel
     grouped_df = filtered_df.groupby('hotel_id', observed=True).head(2)
-    # Create a new DataFrame for storing image descriptions
     description_data = []
-    # Download and generate descriptions for the images
     for index, row in grouped_df.iterrows():
         hotel_id = row['hotel_id']
         hotel_name = row['hotel_name']
@@ -250,108 +140,71 @@ def search_hotel(place=None):
         try:
             response = requests.get(image_url, verify=False)
-            response.raise_for_status()  # Check for request errors
             img = Image.open(BytesIO(response.content))
-            # Generate description for the image
-            prompt = "USER: <image>\nAnalyze this image.  Give me feedback on whether this hotel is worth visiting based on the picture. Provide a summary  review.\nASSISTANT:"
-            outputs = pipe(img, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
             description = outputs[0]["generated_text"].split("\nASSISTANT:")[-1].strip()
-            # Append data to the list
-            description_data.append({
-                'hotel_name': hotel_name,
-                'hotel_id': hotel_id,
-                'image': img,
-                'description': description
-            })
         except (requests.RequestException, UnidentifiedImageError):
             print(f"Skipping image at URL: {image_url}")
-    # Create a DataFrame from the description data
-    description_df = pd.DataFrame(description_data)
-    return description_df
 def show_hotels(place=None):
     description_df = search_hotel(place)
-    # Calculate the number of rows needed
     num_images = len(description_df)
-    num_rows = (num_images + 1) // 2  # Two images per row
     fig, axs = plt.subplots(num_rows * 2, 2, figsize=(20, 10 * num_rows))
     current_index = 0
     for _, row in description_df.iterrows():
         img = row['image']
         description = row['description']
-        if img is None:  # Skip if the image is missing
             continue
         row_idx = (current_index // 2) * 2
         col_idx = current_index % 2
-        # Plot the image
         axs[row_idx, col_idx].imshow(img)
         axs[row_idx, col_idx].axis('off')
         axs[row_idx, col_idx].set_title(f"{row['hotel_name']}\nHotel ID: {row['hotel_id']} Image {current_index + 1}", fontsize=16)
-        # Wrap the description text
         wrapped_description = "\n".join(textwrap.wrap(description, width=50))
-        # Plot the description
         axs[row_idx + 1, col_idx].text(0.5, 0.5, wrapped_description, ha='center', va='center', wrap=True, fontsize=14)
         axs[row_idx + 1, col_idx].axis('off')
         current_index += 1
-    # Hide any unused subplots
-    total_plots = (current_index + 1) // 2 * 2
-    for j in range(current_index, total_plots * 2):
-        row_idx = (j // 2) * 2
-        col_idx = j % 2
-        if row_idx < num_rows * 2:
-            axs[row_idx, col_idx].axis('off')
-        if row_idx + 1 < num_rows * 2:
-            axs[row_idx + 1, col_idx].axis('off')
     plt.tight_layout()
     plt.show()
 def grouped_description(description_df):
-  # Group by 'hotel_id' and aggregate descriptions
-  grouped_descriptions = description_df.groupby('hotel_id')['description'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
-  # Merge with original DataFrame to get hotel names
-  result_df = pd.merge(grouped_descriptions, description_df[['hotel_id', 'hotel_name']], on='hotel_id', how='left')
-  # Drop duplicates and keep only the first occurrence of each hotel_id
-  result_df = result_df.drop_duplicates(subset='hotel_id', keep='first')
-  # Reorder columns
-  result_df = result_df[['hotel_name', 'hotel_id', 'description']]
-  return result_df
-# prompt: please create a new python function that given the result_df as an input create a single prompt where  for given hotel_name you append the hotel_id and description , such we can use later this as context for a future llm query
 def create_prompt_result(result_df):
-  prompt = ""
-  for _, row in result_df.iterrows():
-    hotel_name = row['hotel_name']
-    hotel_id = row['hotel_id']
-    description = row['description']
-    prompt += f"Hotel Name: {hotel_name}\nHotel ID: {hotel_id}\nDescription: {description}\n\n"
-  return prompt
-from transformers import pipeline, BitsAndBytesConfig
-import torch
-from langchain import PromptTemplate
-# Create a LangChain prompt template for the hotel recommendation
-hotel_recommendation_template = """
 <s>[INST] <<SYS>>
 You are a helpful and informative chatbot assistant.
 <</SYS>>
@@ -359,27 +212,10 @@ Based on the following hotel descriptions, recommend the best hotel:
 {context_result}
 [/INST]
 """
 @spaces.GPU
 # Define the respond function
-# Use LangChain to create a prompt based on the template
-def build_prompt(context_result):
-    prompt_template = PromptTemplate(template=hotel_recommendation_template)
-    return prompt_template.format(context_result=context_result)
-# Quantization configuration for efficient model loading
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16
-)
-# Initialize the text generation pipeline
-pipe_text = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2",
-                     model_kwargs={"quantization_config": quantization_config})
 def generate_text_response(prompt):
     outputs = pipe_text(prompt, max_new_tokens=500)
-    # Extract only the response after the instruction token
     response = outputs[0]['generated_text'].split("[/INST]")[-1].strip()
-    return response
-#place='Genova Italia'
-#show_hotels(place)

# Imports come first: the flag assignments below read os.environ and call
# torch.cuda.is_available(), so both modules must be bound before they run.
import os
import textwrap
from io import BytesIO

import matplotlib.pyplot as plt
import pandas as pd
import requests
import torch
import urllib3
from geopy.geocoders import Nominatim
from haversine import haversine
from huggingface_hub import InferenceClient
from PIL import Image, UnidentifiedImageError
from transformers import pipeline, BitsAndBytesConfig

# Flags derived from the SPACES_ZERO_GPU, SPACE_ID and LOW_MEMORY environment
# variables, plus the torch device selected for this process.
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Using device: {device}")
print(f"low memory: {LOW_MEMORY}")
# Constants
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
MODEL_ID = "llava-hf/llava-1.5-7b-hf"
TEXT_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"
# Print device and memory info
print(f"Using device: {DEVICE}")
print(f"Low memory: {LOW_MEMORY}")
# Quantization configuration for efficient model loading (4-bit weights,
# float16 compute), shared by both pipelines below.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
# These two classes are used below but are not covered by the file's
# `from transformers import pipeline, BitsAndBytesConfig` line.
from transformers import AutoProcessor, LlavaForConditionalGeneration
# Load models only once
processor = AutoProcessor.from_pretrained(MODEL_ID)
# device_map="auto" already places the quantized weights. Calling .to() on a
# 4-bit bitsandbytes model is rejected by transformers, so no explicit move
# to DEVICE is performed here.
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
)
# A pipeline built from an already-instantiated model cannot infer its
# preprocessing components from a checkpoint name, so they are handed over
# explicitly from the processor loaded above.
pipe_image_to_text = pipeline(
    "image-to-text",
    model=model,
    tokenizer=processor.tokenizer,
    image_processor=processor.image_processor,
)
# Initialize the text generation pipeline
pipe_text = pipeline(
    "text-generation",
    model=TEXT_MODEL_ID,
    model_kwargs={"quantization_config": quantization_config},
)
# Ensure data files are available
current_directory = os.getcwd()
geocoded_hotels_path = os.path.join(current_directory, 'geocoded_hotels.csv')
csv_file_path = os.path.join(current_directory, 'hotel_multimodal.csv')
# Load geocoded hotels data, downloading the CSV on first use.
if not os.path.isfile(geocoded_hotels_path):
    url = 'https://github.com/ruslanmv/watsonx-with-multimodal-llava/raw/master/geocoded_hotels.csv'
    # Bounded wait so a stalled connection cannot block module import forever.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        with open(geocoded_hotels_path, 'wb') as f:
            f.write(response.content)
        print(f"File {geocoded_hotels_path} downloaded successfully!")
    else:
        print(f"Error downloading file. Status code: {response.status_code}")
else:
    print(f"File {geocoded_hotels_path} already exists.")
# If the download above failed, the file is still missing and read_csv raises
# FileNotFoundError; the status code printed above explains why.
geocoded_hotels = pd.read_csv(geocoded_hotels_path)
# Load hotel dataset, caching it as CSV next to the geocoded file.
if not os.path.exists(csv_file_path):
    # Imported here because only this first-run branch needs it and the
    # file's import section does not bring load_dataset into scope.
    from datasets import load_dataset
    dataset = load_dataset("ruslanmv/hotel-multimodal")
    df_hotels = dataset['train'].to_pandas()
    df_hotels.to_csv(csv_file_path, index=False)
    print("Dataset downloaded and saved as CSV.")
else:
    df_hotels = pd.read_csv(csv_file_path)
 def get_current_location():
     """Look up an approximate position from the ipinfo.io JSON endpoint.

     Returns:
         tuple: ``(latitude, longitude)`` as floats, or ``(None, None)`` when
         the request fails, the response has no ``loc`` field, or that field
         cannot be parsed as two numbers.
     """
     try:
         # Bounded wait so a stalled connection cannot hang the caller.
         response = requests.get('https://ipinfo.io/json', timeout=10)
         data = response.json()
         location = data.get('loc', '')
         if location:
             # Convert eagerly inside the try block: a lazy map object would
             # defer float() errors to the caller's unpacking, outside this
             # handler, and would not be a tuple like the failure result.
             latitude, longitude = map(float, location.split(','))
             return latitude, longitude
         else:
             return None, None
     except Exception as e:
         # Best-effort lookup: report the problem and signal "unknown".
         print(f"An error occurred: {e}")
         return None, None
 def get_coordinates(location_name):
     """Fetch latitude and longitude for a location name via Nominatim.

     Args:
         location_name (str): The name of the location (e.g., "Rome, Italy").

     Returns:
         tuple | None: ``(latitude, longitude)`` when the geocoder returns a
         match; ``None`` when it returns nothing or the lookup itself fails.
     """
     # geopy is already a dependency of this file; only the exception base
     # class is needed in addition to the module-level Nominatim import.
     from geopy.exc import GeopyError

     geolocator = Nominatim(user_agent="coordinate_finder")
     try:
         location = geolocator.geocode(location_name)
     except GeopyError as e:
         # Treat a geocoder failure like "not found" so callers, which
         # already handle None, do not crash on a network problem.
         print(f"An error occurred: {e}")
         return None
     if location:
         return location.latitude, location.longitude
     return None
 def find_nearby(place=None, top_n=5):
     """Return the hotels from ``geocoded_hotels`` closest to a reference point.

     Args:
         place (str | None): Location name to geocode. When empty or None the
             position reported by ``get_current_location`` is used instead.
         top_n (int): Number of closest rows to return (default 5).

     Returns:
         pandas.DataFrame | None: The ``top_n`` nearest rows with an added
         ``distance_km`` column, sorted by ascending distance, or ``None``
         when no reference position could be determined.
     """
     if place:
         coordinates = get_coordinates(place)
         if not coordinates:
             print(f"Location not found: {place}")
             return None
         latitude, longitude = coordinates
         print(f"The coordinates of {place} are: Latitude: {latitude}, Longitude: {longitude}")
     else:
         latitude, longitude = get_current_location()
         # Compare against None explicitly: 0.0 is a valid coordinate and
         # must not be mistaken for a failed lookup.
         if latitude is None or longitude is None:
             print("Could not retrieve the current location.")
             return None
     reference_point = (latitude, longitude)
     distances = geocoded_hotels.apply(
         lambda row: haversine(reference_point, (row['latitude'], row['longitude'])),
         axis=1,
     )
     # assign() builds a new frame, so the shared module-level table is not
     # modified by each call.
     ranked = geocoded_hotels.assign(distance_km=distances)
     closest_hotels = ranked.sort_values(by='distance_km').head(top_n)
     print(f"The {top_n} closest locations are:\n")
     print(closest_hotels)
     return closest_hotels
 @spaces.GPU
 # Define the respond function
 def search_hotel(place=None):
     df_found = find_nearby(place)
+    if df_found is None:
+        return pd.DataFrame()
     hotel_ids = df_found["hotel_id"].values.tolist()
     filtered_df = df_hotels[df_hotels['hotel_id'].isin(hotel_ids)]
     filtered_df['hotel_id'] = pd.Categorical(filtered_df['hotel_id'], categories=hotel_ids, ordered=True)
     filtered_df = filtered_df.sort_values('hotel_id').reset_index(drop=True)
     grouped_df = filtered_df.groupby('hotel_id', observed=True).head(2)
     description_data = []
     for index, row in grouped_df.iterrows():
         hotel_id = row['hotel_id']
         hotel_name = row['hotel_name']
         try:
             response = requests.get(image_url, verify=False)
+            response.raise_for_status()
             img = Image.open(BytesIO(response.content))
+            prompt = "USER: <image>\nAnalyze this image. Give me feedback on whether this hotel is worth visiting based on the picture. Provide a summary review.\nASSISTANT:"
+            outputs = pipe_image_to_text(img, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
             description = outputs[0]["generated_text"].split("\nASSISTANT:")[-1].strip()
+            description_data.append({'hotel_name': hotel_name, 'hotel_id': hotel_id, 'image': img, 'description': description})
         except (requests.RequestException, UnidentifiedImageError):
             print(f"Skipping image at URL: {image_url}")
+    return pd.DataFrame(description_data)
 def show_hotels(place=None):
     """Plot each hotel image with its generated description underneath.

     Images are laid out two per row; every image row is followed by a row
     holding the matching wrapped description text.

     Args:
         place (str | None): Forwarded unchanged to ``search_hotel``.

     Returns:
         None. Prints a message and returns early when nothing was found.
     """
     description_df = search_hotel(place)
     if description_df.empty:
         print("No hotels found.")
         return
     num_images = len(description_df)
     num_rows = (num_images + 1) // 2
     _, axs = plt.subplots(num_rows * 2, 2, figsize=(20, 10 * num_rows))
     # Blank every cell first so slots that stay unused (odd image count, or
     # rows skipped below) do not render as empty framed axes.
     for ax in axs.flat:
         ax.axis('off')
     current_index = 0
     for _, row in description_df.iterrows():
         img = row['image']
         description = row['description']
         if img is None:
             continue
         # Image rows sit at even grid rows; the text row is directly below.
         row_idx = (current_index // 2) * 2
         col_idx = current_index % 2
         axs[row_idx, col_idx].imshow(img)
         axs[row_idx, col_idx].axis('off')
         axs[row_idx, col_idx].set_title(f"{row['hotel_name']}\nHotel ID: {row['hotel_id']} Image {current_index + 1}", fontsize=16)
         wrapped_description = "\n".join(textwrap.wrap(description, width=50))
         axs[row_idx + 1, col_idx].text(0.5, 0.5, wrapped_description, ha='center', va='center', wrap=True, fontsize=14)
         axs[row_idx + 1, col_idx].axis('off')
         current_index += 1
     plt.tight_layout()
     plt.show()
 def grouped_description(description_df):
     """Collapse per-image descriptions into one row per hotel.

     Args:
         description_df (pandas.DataFrame): Frame with ``hotel_id``,
             ``hotel_name`` and ``description`` columns. A completely empty
             frame is also accepted.

     Returns:
         pandas.DataFrame: Columns ``hotel_name``, ``hotel_id`` and
         ``description``, one row per ``hotel_id`` in sorted order, with that
         hotel's descriptions joined by single spaces. Empty input gives an
         empty frame with those three columns.
     """
     columns = ['hotel_name', 'hotel_id', 'description']
     # An empty frame may have no columns at all, which would make the
     # groupby below fail with KeyError.
     if description_df.empty:
         return pd.DataFrame(columns=columns)
     joined = (
         description_df.groupby('hotel_id')['description']
         .apply(lambda texts: ' '.join(texts.astype(str)))
         .reset_index()
     )
     # Reduce to one name per hotel before merging, so the merge yields one
     # row per hotel instead of one per image that is de-duplicated later.
     names = description_df.drop_duplicates(subset='hotel_id', keep='first')[['hotel_id', 'hotel_name']]
     result_df = pd.merge(joined, names, on='hotel_id', how='left')
     return result_df[columns]
 def create_prompt_result(result_df):
     """Render hotel rows as one text block usable as LLM context.

     Args:
         result_df (pandas.DataFrame): Frame with ``hotel_name``,
             ``hotel_id`` and ``description`` columns.

     Returns:
         str: One "Hotel Name / Hotel ID / Description" paragraph per row,
         each followed by a blank line; an empty string for an empty frame.
     """
     # An empty frame may lack the columns entirely; nothing to render.
     if result_df.empty:
         return ""
     rows = zip(result_df['hotel_name'], result_df['hotel_id'], result_df['description'])
     # Build the text with a single join rather than repeated concatenation.
     return "".join(
         f"Hotel Name: {hotel_name}\nHotel ID: {hotel_id}\nDescription: {description}\n\n"
         for hotel_name, hotel_id, description in rows
     )
+def build_prompt(context_result):
+    hotel_recommendation_template = """
 <s>[INST] <<SYS>>
 You are a helpful and informative chatbot assistant.
 <</SYS>>
 {context_result}
 [/INST]
 """
+    return hotel_recommendation_template.format(context_result=context_result)
 @spaces.GPU
 def generate_text_response(prompt, max_new_tokens=500):
     """Run the text-generation pipeline and return only the model's answer.

     Args:
         prompt (str): Full prompt text, expected to end its instruction
             part with an ``[/INST]`` marker.
         max_new_tokens (int): Generation budget passed to ``pipe_text``
             (default 500).

     Returns:
         str: The generated text after the last ``[/INST]`` marker, stripped
         of surrounding whitespace.
     """
     outputs = pipe_text(prompt, max_new_tokens=max_new_tokens)
     # Extract only the response after the instruction token.
     return outputs[0]['generated_text'].split("[/INST]")[-1].strip()

requirements.txt CHANGED Viewed

@@ -6,4 +6,4 @@ haversine
 langchain
 accelerate
 bitsandbytes
-transformers @ git+https://github.com/huggingface/transformers.git

 langchain
 accelerate
 bitsandbytes
+transformers