import os
import subprocess
import tempfile

import requests
import whisper
from dotenv import load_dotenv
from flask import Flask, request, jsonify

app = Flask(__name__)

# Gemini API settings
# Load the .env file and fetch the API key from it
load_dotenv()
API_KEY = os.getenv("FIRST_API_KEY")

# Ensure the API key is loaded correctly
if not API_KEY:
    raise ValueError("API Key not found. Make sure it is set in the .env file.")

GEMINI_API_ENDPOINT = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    "gemini-1.5-flash-latest:generateContent"
)
GEMINI_API_KEY = API_KEY

# Load Whisper AI model at startup
print("Loading Whisper AI model...")
whisper_model = whisper.load_model("base")  # Choose model size: tiny, base, small, medium, large
print("Whisper AI model loaded successfully.")


# Define the "/" endpoint for health check
@app.route("/", methods=["GET"])
def health_check():
    return jsonify({"status": "success", "message": "API is running successfully!"}), 200


@app.route("/process-video", methods=["POST"])
def process_video():
    """
    Flask endpoint to process a video:
    1. Extract audio and transcribe it using Whisper AI.
    2. Send the transcription to the Gemini API for recipe information extraction.
    3. Return the structured data in the response.
    """
    if "video" not in request.files:
        return jsonify({"error": "No video file provided"}), 400

    video_file = request.files["video"]
    temp_video_path = None

    try:
        # Save the uploaded video to a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video_file:
            temp_video_path = temp_video_file.name
            video_file.save(temp_video_path)
        print(f"Video file saved: {temp_video_path}")

        # Extract audio and transcribe using Whisper AI
        transcription = transcribe_audio(temp_video_path)
        if not transcription:
            return jsonify({"error": "Audio transcription failed"}), 500

        # Generate structured recipe information using the Gemini API
        structured_data = query_gemini_api(transcription)
        return jsonify(structured_data)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # Clean up the temporary video file
        if temp_video_path and os.path.exists(temp_video_path):
            os.remove(temp_video_path)


def transcribe_audio(video_path):
    """
    Extract audio from a video file and transcribe it using Whisper AI.
    """
    try:
        # Extract audio using ffmpeg
        audio_path = video_path.replace(".mp4", ".wav")
        command = ["ffmpeg", "-i", video_path, "-q:a", "0", "-map", "a", audio_path]
        subprocess.run(command, check=True)
        print(f"Audio extracted to: {audio_path}")

        # Transcribe the audio using Whisper AI
        print("Transcribing audio...")
        result = whisper_model.transcribe(audio_path)

        # Clean up the audio file after transcription
        if os.path.exists(audio_path):
            os.remove(audio_path)

        return result.get("text", "").strip()
    except Exception as e:
        print(f"Error in transcription: {e}")
        return None


def query_gemini_api(transcription):
    """
    Send the transcription text to the Gemini API and fetch structured recipe information.
    """
    try:
        # Define the structured prompt
        prompt = (
            "Analyze the provided cooking video transcription and extract the following structured information:\n"
            "1. Recipe Name: Identify the name of the dish being prepared.\n"
            "2. Ingredients List: Extract a detailed list of ingredients with their respective quantities (if mentioned).\n"
            "3. Steps for Preparation: Provide a step-by-step breakdown of the recipe's preparation process, organized and numbered sequentially.\n"
            "4. Cooking Techniques Used: Highlight the cooking techniques demonstrated in the video, such as searing, blitzing, wrapping, etc.\n"
            "5. Equipment Needed: List all tools, appliances, or utensils mentioned, e.g., blender, hot pan, cling film, etc.\n"
            "6. Nutritional Information (if inferred): Provide an approximate calorie count or nutritional breakdown based on the ingredients used.\n"
            "7. Serving Size: In count of people or portion size.\n"
            "8. Special Notes or Variations: Include any specific tips, variations, or alternatives mentioned.\n"
            "9. Festive or Thematic Relevance: Note if the recipe has any special relevance to holidays, events, or seasons.\n"
            f"Text: {transcription}\n"
        )

        # Prepare the payload and headers
        payload = {"contents": [{"parts": [{"text": prompt}]}]}
        headers = {"Content-Type": "application/json"}

        # Send the request to the Gemini API
        print("Querying Gemini API...")
        response = requests.post(
            f"{GEMINI_API_ENDPOINT}?key={GEMINI_API_KEY}",
            json=payload,
            headers=headers,
        )
        response.raise_for_status()

        # Extract and return the structured text from the first candidate
        data = response.json()
        return (
            data.get("candidates", [{}])[0]
            .get("content", {})
            .get("parts", [{}])[0]
            .get("text", "No result found")
        )
    except requests.exceptions.RequestException as e:
        print(f"Error querying Gemini API: {e}")
        return {"error": str(e)}


if __name__ == "__main__":
    app.run(debug=True)
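# ---------------------------------------------------------------------------
# Example usage (illustrative sketch, kept as comments so it does not run with
# the server). It assumes the app is running locally on Flask's default port
# 5000 and that "sample_recipe.mp4" is a short cooking video on disk; both the
# filename and the port are placeholders, not part of the service itself.
#
#   curl -X POST -F "video=@sample_recipe.mp4" http://127.0.0.1:5000/process-video
#
# Or with a small Python client:
#
#   import requests
#   with open("sample_recipe.mp4", "rb") as f:
#       resp = requests.post(
#           "http://127.0.0.1:5000/process-video",
#           files={"video": ("sample_recipe.mp4", f, "video/mp4")},
#       )
#   print(resp.json())
# ---------------------------------------------------------------------------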