# AI-Interviewer / app.py
import gradio as gr
import time
import torch
import tempfile
import numpy as np
import scipy.io.wavfile as wavfile
from transformers import AutoProcessor, BarkModel, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import whisper
import cv2
import os
import json
import shutil
import librosa
from moviepy.editor import VideoFileClip
from deepface import DeepFace
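# NOTE: The LLM helper functions used below (extract_candidate_details, extract_job_details,
# build_interview_prompt, eval_question_quality, generate_reference_answer, evaluate_answer)
# and the groq_llm client are assumed to be provided elsewhere in this project; they are not
# defined in this file.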
# Bark TTS
model_bark = BarkModel.from_pretrained("suno/bark")
processor_bark = AutoProcessor.from_pretrained("suno/bark")
model_bark.to("cuda" if torch.cuda.is_available() else "cpu")
bark_voice_preset = "v2/en_speaker_6"
def bark_tts(text):
    # Synthesize speech with Bark and return the path to a temporary WAV file.
    inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
    inputs = {k: v.to(model_bark.device) for k, v in inputs.items()}
    speech_values = model_bark.generate(**inputs)
    speech = speech_values.cpu().numpy().squeeze()
    speech = (speech * 32767).astype(np.int16)
    temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    sample_rate = model_bark.generation_config.sample_rate  # Bark generates 24 kHz audio, not 22.05 kHz
    wavfile.write(temp_wav.name, sample_rate, speech)
    return temp_wav.name
# Whisper STT
whisper_model = whisper.load_model("base")
def whisper_stt(audio_path):
if not audio_path or not os.path.exists(audio_path): return ""
result = whisper_model.transcribe(audio_path)
return result["text"]
# DeepFace (Video Face Emotion)
def ensure_mp4(video_input):
# video_input could be a file-like object, a path, or a Gradio temp path
if isinstance(video_input, str):
input_path = video_input
else:
# It's a file-like object (rare for Gradio video, but handle it)
with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as temp_in:
temp_in.write(video_input.read())
input_path = temp_in.name
# If already mp4, return as is
if input_path.endswith(".mp4"):
return input_path
# Convert to mp4 using moviepy
mp4_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
try:
clip = VideoFileClip(input_path)
clip.write_videofile(mp4_path, codec="libx264", audio=False, verbose=False, logger=None)
clip.close()
except Exception as e:
print("Video conversion failed:", e)
# As fallback, just copy original
shutil.copy(input_path, mp4_path)
return mp4_path
def analyze_video_emotions(video_input, sample_rate=15):
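    # Sample every `sample_rate`-th frame, run DeepFace emotion detection on it,
    # and return the most frequent dominant emotion across the sampled frames.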
# Convert input to an mp4 file OpenCV can process
mp4_path = ensure_mp4(video_input)
if not mp4_path or not os.path.exists(mp4_path):
return "neutral"
cap = cv2.VideoCapture(mp4_path)
frame_count = 0
emotion_counts = {}
while True:
ret, frame = cap.read()
if not ret: break
if frame_count % sample_rate == 0:
try:
result = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
dominant = result[0]["dominant_emotion"] if isinstance(result, list) else result["dominant_emotion"]
emotion_counts[dominant] = emotion_counts.get(dominant, 0) + 1
except Exception: pass
frame_count += 1
cap.release()
if not emotion_counts: return "neutral"
return max(emotion_counts.items(), key=lambda x: x[1])[0]
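# Wav2Vec2 (Voice Emotion)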
wav2vec_model_name = "HaniaRuby/speech-emotion-recognition-wav2vec2"
wav2vec_processor = Wav2Vec2Processor.from_pretrained(wav2vec_model_name)
wav2vec_model = Wav2Vec2ForSequenceClassification.from_pretrained(wav2vec_model_name)
wav2vec_model.eval()
voice_label_map = {
0: 'angry', 1: 'disgust', 2: 'fear', 3: 'happy',
4: 'neutral', 5: 'sad', 6: 'surprise'
}
def analyze_audio_emotion(audio_path):
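    # Resample the recording to 16 kHz and classify the dominant vocal emotion
    # with the wav2vec2 model loaded above.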
if not audio_path or not os.path.exists(audio_path): return "neutral"
speech, sr = librosa.load(audio_path, sr=16000)
inputs = wav2vec_processor(speech, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
logits = wav2vec_model(**inputs).logits
probs = torch.nn.functional.softmax(logits, dim=-1)
predicted_id = torch.argmax(probs, dim=-1).item()
return voice_label_map.get(predicted_id, "neutral")
# --- Effective confidence calculation ---
def interpret_confidence(voice_label, face_label, answer_score_label, k=0.2):
    # Both emotion models above emit the label "surprise", so that key is used here.
    emotion_map = {"happy": 0.9, "neutral": 0.6, "surprise": 0.7, "sad": 0.4, "angry": 0.3,
                   "disgust": 0.2, "fear": 0.3, "no_face": 0.5, "unknown": 0.5}
    answer_score_map = {"excellent": 1.0, "good": 0.8, "medium": 0.6, "poor": 0.3}
    voice_score = emotion_map.get(voice_label, 0.5)
    face_score = emotion_map.get(face_label, 0.5)
    answer_score = answer_score_map.get(answer_score_label, 0.5)
    avg_emotion = (voice_score + face_score) / 2
    control_bonus = max(0, answer_score - avg_emotion) * k
    eff_conf = 0.5 * answer_score + 0.22 * voice_score + 0.18 * face_score + 0.1 * control_bonus
    return {"effective_confidence": round(eff_conf, 3), "answer_score": round(answer_score, 2),
            "voice_score": round(voice_score, 2), "face_score": round(face_score, 2),
            "control_bonus": round(control_bonus, 3)}
seniority_mapping = {
"Entry-level": 1, "Junior": 2, "Mid-Level": 3, "Senior": 4, "Lead": 5
}
# --- 2. Gradio App ---
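# UI flow: candidate info -> optional missing-info form -> pre-interview greeting -> interview loop -> summary.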
with gr.Blocks(theme=gr.themes.Soft()) as demo:
user_data = gr.State({})
interview_state = gr.State({})
missing_fields_state = gr.State([])
# --- UI Layout ---
with gr.Column(visible=True) as user_info_section:
gr.Markdown("## Candidate Information")
cv_file = gr.File(label="Upload CV")
job_desc = gr.Textbox(label="Job Description")
start_btn = gr.Button("Continue", interactive=False)
with gr.Column(visible=False) as missing_section:
gr.Markdown("## Missing Information")
name_in = gr.Textbox(label="Name", visible=False)
role_in = gr.Textbox(label="Job Role", visible=False)
seniority_in = gr.Dropdown(list(seniority_mapping.keys()), label="Seniority", visible=False)
skills_in = gr.Textbox(label="Skills", visible=False)
submit_btn = gr.Button("Submit", interactive=False)
with gr.Column(visible=False) as interview_pre_section:
pre_interview_greeting_md = gr.Markdown()
start_interview_final_btn = gr.Button("Start Interview")
with gr.Column(visible=False) as interview_section:
gr.Markdown("## Interview in Progress")
question_audio = gr.Audio(label="Listen", interactive=False, autoplay=True)
question_text = gr.Markdown()
user_audio_input = gr.Audio(sources=["microphone"], type="filepath", label="1. Record Audio Answer")
user_video_input = gr.Video(sources=["webcam"], label="2. Record Video Answer")
stt_transcript = gr.Textbox(label="Transcribed Answer (edit if needed)")
confirm_btn = gr.Button("Confirm Answer")
evaluation_display = gr.Markdown()
emotion_display = gr.Markdown()
interview_summary = gr.Markdown(visible=False)
# --- UI Logic ---
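    # Enable the "Continue" button only when both a CV file and a non-empty job description are provided.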
def validate_start_btn(cv_file, job_desc):
return gr.update(interactive=(cv_file is not None and hasattr(cv_file, "name") and bool(job_desc and job_desc.strip())))
cv_file.change(validate_start_btn, [cv_file, job_desc], start_btn)
job_desc.change(validate_start_btn, [cv_file, job_desc], start_btn)
def process_and_route_initial(cv_file, job_desc):
details = extract_candidate_details(cv_file.name)
job_info = extract_job_details(job_desc)
data = {
"name": details.get("name", "unknown"), "job_role": job_info.get("job_title", "unknown"),
"seniority": job_info.get("experience_level", "unknown"), "skills": job_info.get("skills", [])
}
missing = [k for k, v in data.items() if (isinstance(v, str) and v.lower() == "unknown") or not v]
        if missing:
            return data, missing, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update()
        else:
            # Make the pre-interview section itself visible, not just the greeting Markdown nested inside it.
            greeting = f"Hello {data['name']}, your profile is ready. Click 'Start Interview' when ready."
            return data, missing, gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(value=greeting)
    start_btn.click(
        process_and_route_initial,
        [cv_file, job_desc],
        [user_data, missing_fields_state, user_info_section, missing_section, interview_pre_section, pre_interview_greeting_md]
    )
def show_missing(missing):
if missing is None: missing = []
return gr.update(visible="name" in missing), gr.update(visible="job_role" in missing), gr.update(visible="seniority" in missing), gr.update(visible="skills" in missing)
missing_fields_state.change(show_missing, missing_fields_state, [name_in, role_in, seniority_in, skills_in])
    def validate_fields(name, role, seniority, skills, missing):
        if not missing: return gr.update(interactive=False)
        all_filled = all([
            ("name" not in missing) or bool(name.strip()), ("job_role" not in missing) or bool(role.strip()),
            ("seniority" not in missing) or bool(seniority), ("skills" not in missing) or bool(skills.strip()),
        ])
        return gr.update(interactive=all_filled)
for inp in [name_in, role_in, seniority_in, skills_in]:
inp.change(validate_fields, [name_in, role_in, seniority_in, skills_in, missing_fields_state], submit_btn)
def complete_manual(data, name, role, seniority, skills):
if data["name"].lower() == "unknown": data["name"] = name
if data["job_role"].lower() == "unknown": data["job_role"] = role
if data["seniority"].lower() == "unknown": data["seniority"] = seniority
if not data["skills"]: data["skills"] = [s.strip() for s in skills.split(",")]
greeting = f"Hello {data['name']}, your profile is ready. Click 'Start Interview' to begin."
return data, gr.update(visible=False), gr.update(visible=True), gr.update(value=greeting)
submit_btn.click(complete_manual, [user_data, name_in, role_in, seniority_in, skills_in], [user_data, missing_section, interview_pre_section, pre_interview_greeting_md])
def start_interview(data):
# --- Advanced state with full logging ---
state = {
"questions": [], "answers": [], "face_labels": [], "voice_labels": [], "timings": [],
"question_evaluations": [], "answer_evaluations": [], "effective_confidences": [],
"conversation_history": [],
"difficulty_adjustment": None,
"question_idx": 0, "max_questions": 3, "q_start_time": time.time(),
"log": []
}
        # --- Optional: context retrieval could be added here (currently empty) ---
context = ""
prompt = build_interview_prompt(
conversation_history=[], user_response="", context=context, job_role=data["job_role"],
skills=data["skills"], seniority=data["seniority"], difficulty_adjustment=None,
voice_label="neutral", face_label="neutral"
)
first_q = groq_llm.predict(prompt)
# Evaluate Q for quality
q_eval = eval_question_quality(first_q, data["job_role"], data["seniority"], None)
state["questions"].append(first_q)
state["question_evaluations"].append(q_eval)
state["conversation_history"].append({'role': 'Interviewer', 'content': first_q})
audio_path = bark_tts(first_q)
# LOG
state["log"].append({"type": "question", "question": first_q, "question_eval": q_eval, "timestamp": time.time()})
return state, gr.update(visible=False), gr.update(visible=True), audio_path, f"*Question 1:* {first_q}"
start_interview_final_btn.click(start_interview, [user_data], [interview_state, interview_pre_section, interview_section, question_audio, question_text])
def transcribe(audio_path):
return whisper_stt(audio_path)
user_audio_input.change(transcribe, user_audio_input, stt_transcript)
def process_answer(transcript, audio_path, video_path, state, data):
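        # One interview turn: log the answer, analyze voice/face emotions, evaluate the response,
        # adjust difficulty, then either ask the next question or produce the final summary.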
if not transcript and not video_path:
return state, gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
elapsed = round(time.time() - state.get("q_start_time", time.time()), 2)
state["timings"].append(elapsed)
state["answers"].append(transcript)
state["conversation_history"].append({'role': 'Candidate', 'content': transcript})
# --- 1. Emotion analysis ---
voice_label = analyze_audio_emotion(audio_path)
face_label = analyze_video_emotions(video_path)
state["voice_labels"].append(voice_label)
state["face_labels"].append(face_label)
# --- 2. Evaluate previous Q and Answer ---
last_q = state["questions"][-1]
q_eval = state["question_evaluations"][-1] # Already in state
ref_answer = generate_reference_answer(last_q, data["job_role"], data["seniority"])
answer_eval = evaluate_answer(last_q, transcript, ref_answer, data["job_role"], data["seniority"], None)
state["answer_evaluations"].append(answer_eval)
answer_score = answer_eval.get("Score", "medium") if answer_eval else "medium"
# --- 3. Adaptive difficulty ---
if answer_score == "excellent":
state["difficulty_adjustment"] = "harder"
elif answer_score in ("medium", "poor"):
state["difficulty_adjustment"] = "easier"
else:
state["difficulty_adjustment"] = None
# --- 4. Effective confidence ---
eff_conf = interpret_confidence(voice_label, face_label, answer_score)
state["effective_confidences"].append(eff_conf)
# --- LOG ---
state["log"].append({
"type": "answer",
"question": last_q,
"answer": transcript,
"answer_eval": answer_eval,
"ref_answer": ref_answer,
"face_label": face_label,
"voice_label": voice_label,
"effective_confidence": eff_conf,
"timing": elapsed,
"timestamp": time.time()
})
# --- Next or End ---
qidx = state["question_idx"] + 1
if qidx >= state["max_questions"]:
# Save as JSON (optionally)
timestamp = time.strftime("%Y%m%d_%H%M%S")
log_file = f"interview_log_{timestamp}.json"
with open(log_file, "w", encoding="utf-8") as f:
json.dump(state["log"], f, indent=2, ensure_ascii=False)
# Report
summary = "# Interview Summary\n"
for i, q in enumerate(state["questions"]):
summary += (f"\n### Q{i + 1}: {q}\n"
f"- *Answer*: {state['answers'][i]}\n"
f"- *Q Eval*: {state['question_evaluations'][i]}\n"
f"- *A Eval*: {state['answer_evaluations'][i]}\n"
                        f"- *Face Emotion*: {state['face_labels'][i]}, *Voice Emotion*: {state['voice_labels'][i]}\n"
f"- *Effective Confidence*: {state['effective_confidences'][i]['effective_confidence']}\n"
f"- *Time*: {state['timings'][i]}s\n")
summary += f"\n\n⏺ Full log saved as {log_file}."
return (state, gr.update(visible=True, value=summary), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(visible=True, value=f"Last Detected — Face: {face_label}, Voice: {voice_label}"))
else:
# --- Build next prompt using adaptive difficulty ---
state["question_idx"] = qidx
state["q_start_time"] = time.time()
context = "" # You can add your context logic here
prompt = build_interview_prompt(
conversation_history=state["conversation_history"],
user_response=transcript,
context=context,
job_role=data["job_role"],
skills=data["skills"],
seniority=data["seniority"],
difficulty_adjustment=state["difficulty_adjustment"],
face_label=face_label,
voice_label=voice_label,
effective_confidence=eff_conf
)
next_q = groq_llm.predict(prompt)
# Evaluate Q quality
q_eval = eval_question_quality(next_q, data["job_role"], data["seniority"], None)
state["questions"].append(next_q)
state["question_evaluations"].append(q_eval)
state["conversation_history"].append({'role': 'Interviewer', 'content': next_q})
state["log"].append({"type": "question", "question": next_q, "question_eval": q_eval, "timestamp": time.time()})
audio_path = bark_tts(next_q)
# Display evaluations
eval_md = f"*Last Answer Eval:* {answer_eval}\n\n*Effective Confidence:* {eff_conf}"
return (
state, gr.update(visible=False), audio_path, f"*Question {qidx + 1}:* {next_q}",
gr.update(value=None), gr.update(value=None),
gr.update(visible=True, value=f"Last Detected — Face: {face_label}, Voice: {voice_label}"),
)
confirm_btn.click(
process_answer,
[stt_transcript, user_audio_input, user_video_input, interview_state, user_data],
[interview_state, interview_summary, question_audio, question_text, user_audio_input, user_video_input, emotion_display]
).then(
lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, user_video_input]
)
demo.launch()