"""Seed the ``sample_mflix.embedded_movies`` collection with embedded sample movies.

Running this script drops the target collection, generates an OpenAI embedding
for each sample movie plot, inserts the resulting documents, and prints the
Atlas Vector Search index definition that has to be created manually.
"""

import os

from pymongo import MongoClient
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize clients
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
atlas_uri = os.getenv("ATLAS_URI")
client = MongoClient(atlas_uri)
db = client['sample_mflix']
collection = db['embedded_movies']

# Sample movie data
sample_movies = [
    {
        "title": "The Matrix",
        "year": 1999,
        "plot": "A computer programmer discovers that reality as he knows it is a simulation created by machines, and joins a rebellion to overthrow them."
    },
    {
        "title": "Inception",
        "year": 2010,
        "plot": "A thief who enters the dreams of others to steal secrets from their subconscious is offered a chance to regain his old life in exchange for a task considered impossible: inception."
    },
    {
        "title": "The Shawshank Redemption",
        "year": 1994,
        "plot": "Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency."
    },
    {
        "title": "Jurassic Park",
        "year": 1993,
        "plot": "A pragmatic paleontologist visiting an almost complete theme park is tasked with protecting a couple of kids after a power failure causes the park's cloned dinosaurs to run loose."
    },
    {
        "title": "The Lord of the Rings: The Fellowship of the Ring",
        "year": 2001,
        "plot": "A young hobbit, Frodo, who has found the One Ring that belongs to the Dark Lord Sauron, begins his journey with eight companions to Mount Doom, the only place where it can be destroyed."
    }
]


def get_embedding(text: str, model: str = "text-embedding-ada-002") -> list[float]:
    """Get the embedding for the given text using the OpenAI API.

    Args:
        text: Text to embed. Newlines are replaced with spaces before the
            request is sent.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The embedding vector of the first (and only) input in the request.
    """
    text = text.replace("\n", " ")
    resp = openai_client.embeddings.create(
        input=[text],
        model=model
    )
    return resp.data[0].embedding


def _print_index_instructions() -> None:
    """Print the manual steps for creating the vector search index in Atlas."""
    print("\nIMPORTANT: You need to create the vector search index manually in the Atlas UI:")
    print("1. Go to your Atlas cluster")
    print("2. Click on 'Search' tab")
    print("3. Create an index named 'idx_plot_embedding' with this definition:")
    # numDimensions must match the length of the vectors that get_embedding
    # returns for the model used during setup.
    print("""
        {
          "fields": [
            {
              "type": "vector",
              "path": "plot_embedding",
              "numDimensions": 1536,
              "similarity": "dotProduct"
            }
          ]
        }
        """)


def setup_data():
    """Recreate the collection with embedded sample movies.

    Drops the existing collection, inserts every entry of ``sample_movies``
    together with a ``plot_embedding`` field, and prints the index creation
    instructions. Any error is reported on stdout rather than raised, and the
    module-level MongoDB client is always closed afterwards.
    """
    try:
        # Without a URI, MongoClient falls back to its default host
        # (localhost), so the drop below would hit an unintended server.
        if not atlas_uri:
            raise RuntimeError(
                "ATLAS_URI is not set; refusing to drop and recreate the collection"
            )

        # Drop existing collection if it exists
        collection.drop()
        print("Dropped existing collection")

        # Add embeddings to movies and insert into collection
        for movie in sample_movies:
            # Build a fresh document so the module-level sample data is not
            # mutated (insert_one also adds an `_id` to the dict it is given).
            document = {**movie, "plot_embedding": get_embedding(movie["plot"])}

            # Insert movie with embedding
            collection.insert_one(document)
            print(f"Inserted movie: {movie['title']}")

        print("\nData setup completed successfully!")
        _print_index_instructions()

    except Exception as e:
        # Top-level boundary of the script: report the failure and fall
        # through to the cleanup below.
        print(f"Error during setup: {e}")
    finally:
        client.close()


if __name__ == "__main__":
    setup_data()