Commit 2c966e2
1 Parent(s): f37c341

Adding application files

- .gitignore +3 -0
- Dockerfile +13 -0
- README.md +3 -1
- helpers/__init__.py +15 -0
- helpers/audio_removal.py +23 -0
- helpers/dense_sampling.py +26 -0
- helpers/extract_faces.py +101 -0
- helpers/file_utils.py +28 -0
- main.py +29 -0
- middleware/__init__.py +1 -0
- middleware/cleanup_middleware.py +16 -0
- model/README.md +24 -0
- modelfile.py +183 -0
- predict/__init_.py +1 -0
- predict/model_predictor.py +39 -0
- preprocessing/__init__.py +5 -0
- preprocessing/preprocess_video.py +36 -0
- requirements.txt +75 -0
- routes/__init__.py +1 -0
- routes/video_routes.py +76 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
+__pycache__/
+*.pyc
+*.pyo
Dockerfile
ADDED
@@ -0,0 +1,13 @@
+FROM python:3.11
+
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /deepfake-video-detection
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . /deepfake-video-detection
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -9,4 +9,6 @@ license: mit
 short_description: FastAPI Backend for DeepFake Video Detection
 ---
 
-
+# CViT Deepfake Detection Model
+
+This repository contains the inference code for the CViT-based deepfake video detection model. Due to file size limitations, the model weights are hosted on Hugging Face Hub.
helpers/__init__.py
ADDED
@@ -0,0 +1,15 @@
+from .dense_sampling import dense_sampling_from_extracted_frames
+from .audio_removal import remove_audio
+from .extract_faces import detect_faces_in_video
+from .file_utils import (
+    delete_folders,
+    delete_videos
+)
+
+__all__=[
+    "dense_sampling_from_extracted_frames",
+    "remove_audio",
+    "detect_faces_in_video",
+    "delete_folders",
+    "delete_videos"
+]
helpers/audio_removal.py
ADDED
@@ -0,0 +1,23 @@
+import os
+import subprocess
+
+def remove_audio(input_file):
+    output_file = f"processed_{os.path.basename(input_file).rsplit('.', 1)[0]}.mp4"
+
+    ffmpeg_cmd = [
+        'ffmpeg',
+        '-i', input_file,
+        '-c:v', 'libx264',
+        '-preset', 'ultrafast',
+        '-an',
+        '-y',
+        output_file
+    ]
+
+    try:
+        result = subprocess.run(ffmpeg_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        print(f"Processed video saved to: {output_file}")
+        return output_file
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        return None
helpers/dense_sampling.py
ADDED
@@ -0,0 +1,26 @@
+import os
+import random
+import numpy as np
+
+def dense_sampling_from_extracted_frames(folder_path, num_clips=6, frames_per_clip=5):
+    frame_files = sorted([os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.npy')])
+    num_frames = len(frame_files)
+
+    print(f"Found {num_frames} frames in {folder_path}")
+    if num_frames < num_clips * frames_per_clip:
+        raise ValueError("Not enough frames to sample the required clips.")
+
+    frames_per_segment = num_frames // num_clips
+
+    clips = []
+
+    for i in range(num_clips):
+        segment_start = i * frames_per_segment
+        segment_end = segment_start + frames_per_segment - 1
+        max_start_frame = segment_end - frames_per_clip + 1
+        start_frame = random.randint(segment_start, max_start_frame)
+
+        clip = [np.load(frame_files[start_frame + j]) for j in range(frames_per_clip)]
+        clips.append(clip)
+
+    return clips
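
With the defaults used elsewhere in this commit (num_clips=6, frames_per_clip=5), this returns 6 clips of 5 frames each, i.e. 30 sampled face crops in total, which matches the hard-coded `total_frames = 30` in routes/video_routes.py. A minimal illustration (not part of the commit; the folder name follows the directories created by the routes):

```python
# Illustrative only: with the commit's defaults, 6 clips x 5 frames = 30 sampled face crops.
from helpers.dense_sampling import dense_sampling_from_extracted_frames

clips = dense_sampling_from_extracted_frames("extracted_frames", num_clips=6, frames_per_clip=5)
print(len(clips), len(clips[0]))  # 6 5
```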
helpers/extract_faces.py
ADDED
@@ -0,0 +1,101 @@
+import os
+import cv2
+import numpy as np
+from tqdm import tqdm
+from mtcnn import MTCNN
+
+def normalize_frame(frame, mean, std):
+    frame = frame / 255.0
+    mean = np.array(mean).reshape(1, 1, 3)
+    std = np.array(std).reshape(1, 1, 3)
+    normalized_frame = (frame - mean) / std
+    return normalized_frame
+
+def detect_faces_in_video(video_path, output_dir, padding_percentage=0.3,
+                          mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225],
+                          full_detection_interval=10):
+    os.makedirs(output_dir, exist_ok=True)
+
+    detector = MTCNN()
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise Exception(f"Error: Unable to open video file {video_path}")
+
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    frame_count = 0
+    cropped_faces = []
+    trackers = []
+
+    with tqdm(total=total_frames, desc="Extracting faces", unit="frame") as pbar:
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+
+            if frame is None:
+                print(f"[WARNING] Empty frame at {frame_count}")
+                continue
+
+            if frame_count % full_detection_interval == 0:
+                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                faces = detector.detect_faces(rgb_frame)
+                trackers = []
+
+                for i, face in enumerate(faces):
+                    confidence = face['confidence']
+                    if confidence < 0.85:
+                        continue
+
+                    x, y, w, h = face['box']
+                    if w < 50 or h < 50:
+                        continue
+
+                    padding = max(1, int(min(w, h) * padding_percentage))
+                    x1 = max(0, x - padding)
+                    y1 = max(0, y - padding)
+                    x2 = min(rgb_frame.shape[1], x + w + padding)
+                    y2 = min(rgb_frame.shape[0], y + h + padding)
+
+                    cropped_face = frame[y1:y2, x1:x2]
+                    if cropped_face.size == 0:
+                        continue
+
+                    resized_cropped_face = cv2.resize(cropped_face, (224, 224))
+                    normalized_face = normalize_frame(resized_cropped_face, mean, std)
+
+                    face_filename = f"frame_{frame_count:05d}_face_{i}.npy"
+                    face_path = os.path.join(output_dir, face_filename)
+                    np.save(face_path, normalized_face)
+                    cropped_faces.append(face_path)
+
+                    tracker = cv2.TrackerCSRT_create()
+                    tracker.init(frame, (x, y, w, h))
+                    trackers.append(tracker)
+            else:
+                for i, tracker in enumerate(trackers):
+                    success, box = tracker.update(frame)
+                    if success:
+                        x, y, w, h = [int(v) for v in box]
+                        padding = max(1, int(min(w, h) * padding_percentage))
+                        x1 = max(0, x - padding)
+                        y1 = max(0, y - padding)
+                        x2 = min(frame.shape[1], x + w + padding)
+                        y2 = min(frame.shape[0], y + h + padding)
+
+                        cropped_face = frame[y1:y2, x1:x2]
+                        if cropped_face.size == 0:
+                            continue
+
+                        resized_cropped_face = cv2.resize(cropped_face, (224, 224))
+                        normalized_face = normalize_frame(resized_cropped_face, mean, std)
+
+                        face_filename = f"frame_{frame_count:05d}_track_{i}.npy"
+                        face_path = os.path.join(output_dir, face_filename)
+                        np.save(face_path, normalized_face)
+                        cropped_faces.append(face_path)
+
+            frame_count += 1
+            pbar.update(1)
+
+    cap.release()
+    return cropped_faces
helpers/file_utils.py
ADDED
@@ -0,0 +1,28 @@
+import os
+import shutil
+import glob
+
+def delete_videos(video_filenames):
+    for video_filename in video_filenames:
+        if "*" in video_filename:
+            matched_files = glob.glob(video_filename)
+            for file in matched_files:
+                try:
+                    os.remove(file)
+                    print(f"{file} has been deleted successfully.")
+                except Exception as e:
+                    print(f"Error deleting {file}: {e}")
+        else:
+            try:
+                if os.path.exists(video_filename):
+                    os.remove(video_filename)
+                    print(f"{video_filename} has been deleted successfully.")
+                else:
+                    print(f"{video_filename} does not exist.")
+            except Exception as e:
+                print(f"Error deleting {video_filename}: {e}")
+
+def delete_folders(*folders):
+    for folder in folders:
+        if os.path.exists(folder):
+            shutil.rmtree(folder)
main.py
ADDED
@@ -0,0 +1,29 @@
+import os
+os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
+
+from fastapi import FastAPI
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+
+from routes import router as video_routes
+from middleware import CleanupMiddleware
+
+app = FastAPI()
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+app.add_middleware(CleanupMiddleware)
+
+app.include_router(video_routes)
+
+@app.get("/")
+async def read_root():
+    return JSONResponse(
+        content= {"status": "API is running"}
+    )
middleware/__init__.py
ADDED
@@ -0,0 +1 @@
+from .cleanup_middleware import CleanupMiddleware
middleware/cleanup_middleware.py
ADDED
@@ -0,0 +1,16 @@
+from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.requests import Request
+from helpers.file_utils import delete_folders, delete_videos
+
+class CleanupMiddleware(BaseHTTPMiddleware):
+    async def dispatch(self, request: Request, call_next):
+        response = await call_next(request)
+
+        output_dir_for_extracted_frames = "extracted_frames"
+        output_dir_for_sampled_frames = "sampled_frames"
+        video_files_to_delete = ["processed*.mp4"]
+
+        delete_folders(output_dir_for_extracted_frames, output_dir_for_sampled_frames)
+        delete_videos(video_files_to_delete)
+
+        return response
model/README.md
ADDED
@@ -0,0 +1,24 @@
+## ✅ No Manual Download Needed
+
+The model file (`cvit2_deepfake_detection_ep_50.pth`) is hosted on the Hugging Face Hub and is **automatically downloaded** when you run the code.
+
+If you're running the project locally, make sure your environment has internet access. The model will be downloaded from:
+
+👉 [cvit2_deepfake_detection_ep_50.pth on Hugging Face](https://huggingface.co/mhamza-007/cvit_deepfake_detection/tree/main)
+
+Once downloaded, it will be **cached locally** for future use.
+
+---
+
+### Programmatic Model Download
+
+To avoid manual downloads, the code uses the Hugging Face Hub API to download the model automatically:
+
+```python
+from huggingface_hub import hf_hub_download
+
+model_path = hf_hub_download(
+    repo_id="mhamza-007/cvit_deepfake_detection",
+    filename="cvit2_deepfake_detection_ep_50.pth"
+)
+```
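
For context, the `model_path` returned above is what `predict/model_predictor.py` in this same commit feeds to `torch.load`; a minimal sketch of that loading step (mirroring the commit's code, with the checkpoint's `state_dict` key) looks like:

```python
# Sketch of how the downloaded checkpoint is loaded, mirroring predict/model_predictor.py.
import torch
from modelfile import CViT

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = CViT()
# The checkpoint stores the weights under a 'state_dict' key.
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True)['state_dict'])
model = model.to(device)
```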
modelfile.py
ADDED
@@ -0,0 +1,183 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+from einops import rearrange
+
+class Residual(nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x, **kwargs):
+        return self.fn(x, **kwargs) + x
+
+class PreNorm(nn.Module):
+    def __init__(self, dim, fn):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim)
+        self.fn = fn
+
+    def forward(self, x, **kwargs):
+        return self.fn(self.norm(x), **kwargs)
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, hidden_dim):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(dim, hidden_dim),
+            nn.GELU(),
+            nn.Linear(hidden_dim, dim)
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+class Attention(nn.Module):
+    def __init__(self, dim, heads=8):
+        super().__init__()
+        self.heads = heads
+        self.scale = dim ** -0.5
+
+        self.to_qkv = nn.Linear(dim, dim * 3, bias=False)
+        self.to_out = nn.Linear(dim, dim)
+
+    def forward(self, x, mask = None):
+        b, n, _, h = *x.shape, self.heads
+        qkv = self.to_qkv(x)
+        q, k, v = rearrange(qkv, 'b n (qkv h d) -> qkv b h n d', qkv=3, h=h)
+
+        dots = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale
+
+        if mask is not None:
+            mask = F.pad(mask.flatten(1), (1, 0), value = True)
+            assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
+            mask = mask[:, None, :] * mask[:, :, None]
+            dots.masked_fill_(~mask, float('-inf'))
+            del mask
+
+        attn = dots.softmax(dim=-1)
+
+        out = torch.einsum('bhij,bhjd->bhid', attn, v)
+        out = rearrange(out, 'b h n d -> b n (h d)')
+        out = self.to_out(out)
+        return out
+
+class Transformer(nn.Module):
+    def __init__(self, dim, depth, heads, mlp_dim):
+        super().__init__()
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(nn.ModuleList([
+                Residual(PreNorm(dim, Attention(dim, heads = heads))),
+                Residual(PreNorm(dim, FeedForward(dim, mlp_dim)))
+            ]))
+
+    def forward(self, x, mask=None):
+        for attn, ff in self.layers:
+            x = attn(x, mask=mask)
+            x = ff(x)
+        return x
+
+class CViT(nn.Module):
+    def __init__(self, image_size=224, patch_size=7, num_classes=2, channels=512,
+                 dim=1024, depth=6, heads=8, mlp_dim=2048):
+        super().__init__()
+        assert image_size % patch_size == 0, 'image dimensions must be divisible by the patch size'
+
+        self.features = nn.Sequential(
+
+            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=32),
+            nn.ReLU(),
+            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=32),
+            nn.ReLU(),
+            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=32),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2, stride=2),
+
+            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=64),
+            nn.ReLU(),
+            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=64),
+            nn.ReLU(),
+            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=64),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2, stride=2),
+
+            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=128),
+            nn.ReLU(),
+            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=128),
+            nn.ReLU(),
+            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=128),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2, stride=2),
+
+            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=256),
+            nn.ReLU(),
+            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=256),
+            nn.ReLU(),
+            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=256),
+            nn.ReLU(),
+            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=256),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2, stride=2),
+
+            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=512),
+            nn.ReLU(),
+            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=512),
+            nn.ReLU(),
+            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=512),
+            nn.ReLU(),
+            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(num_features=512),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2, stride=2)
+        )
+
+        num_patches = (image_size // patch_size) ** 2
+        self.max_sequence_length = num_patches+1
+        patch_dim = channels * patch_size ** 2
+
+        self.patch_size = patch_size
+
+        self.pos_embedding = nn.Parameter(torch.randn(1, self.max_sequence_length, dim))
+        self.patch_to_embedding = nn.Linear(patch_dim, dim)
+        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
+        self.transformer = Transformer(dim, depth, heads, mlp_dim)
+
+        self.to_cls_token = nn.Identity()
+
+        self.mlp_head = nn.Sequential(
+            nn.Linear(dim, mlp_dim),
+            nn.ReLU(),
+            nn.Linear(mlp_dim, num_classes)
+        )
+
+    def forward(self, img, mask=None):
+        p = self.patch_size
+        x = self.features(img)
+        y = rearrange(x, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p)
+
+        y = self.patch_to_embedding(y)
+        cls_tokens = self.cls_token.expand(y.shape[0], -1, -1)
+        x = torch.cat((cls_tokens, y), dim=1)
+
+        x += self.pos_embedding[:, :x.size(1)]
+        x = self.transformer(x, mask)
+        x = self.to_cls_token(x[:, 0])
+
+        return self.mlp_head(x)
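
As a quick sanity check of the architecture above (not part of the commit): with the default arguments, the convolutional stem reduces a 224×224 RGB input to a 7×7×512 feature map, which becomes a single 7×7 patch token plus a class token for the transformer, and the head emits two logits per input. A minimal sketch:

```python
# Minimal sketch (illustrative, not part of the commit): the default CViT
# maps a (batch, 3, 224, 224) input to (batch, 2) class logits.
import torch
from modelfile import CViT

model = CViT()                       # image_size=224, patch_size=7, num_classes=2
dummy = torch.randn(2, 3, 224, 224)  # two random 224x224 RGB inputs
with torch.no_grad():
    logits = model(dummy)
print(logits.shape)                  # torch.Size([2, 2])
```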
predict/__init_.py
ADDED
@@ -0,0 +1 @@
+from .model_predictor import predict_with_model
predict/model_predictor.py
ADDED
@@ -0,0 +1,39 @@
+import torch
+from modelfile import CViT
+from huggingface_hub import hf_hub_download
+
+def predict_with_model(saved_frames):
+    print("PyTorch Version:", torch.__version__)
+    print("Is CUDA Available:", torch.cuda.is_available())
+
+    if torch.cuda.is_available():
+        print("CUDA Version:", torch.version.cuda)
+        print("Available GPU:", torch.cuda.get_device_name(0))
+    else:
+        print("CUDA is not available. Ensure you have installed a CUDA-enabled version of PyTorch.")
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    input_data = torch.tensor(saved_frames, dtype=torch.float32).to(device)
+
+    model_path = hf_hub_download(
+        repo_id="mhamza-007/cvit_deepfake_detection",
+        filename="cvit2_deepfake_detection_ep_50.pth"
+    )
+
+    model = CViT()
+    model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True)['state_dict'])
+    model = model.to(device)
+
+    with torch.no_grad():
+        output = model(input_data)
+
+    predictions = torch.softmax(output, dim=1)
+    predicted_classes = torch.argmax(predictions, dim=1)
+
+    output = output.cpu()
+    predictions = predictions.cpu()
+    predicted_classes = predicted_classes.cpu()
+
+    print("Predicted Classes:", predicted_classes)
+
+    return predicted_classes
preprocessing/__init__.py
ADDED
@@ -0,0 +1,5 @@
+from .preprocess_video import (
+    remove_audio_from_video,
+    extract_face_from_video,
+    sample_frames_from_extracted_frames,
+)
preprocessing/preprocess_video.py
ADDED
@@ -0,0 +1,36 @@
+import os
+import numpy as np
+
+from helpers import detect_faces_in_video
+from helpers import dense_sampling_from_extracted_frames
+from helpers import remove_audio
+
+def remove_audio_from_video(input_file):
+    return remove_audio(input_file)
+
+
+def extract_face_from_video(video_path, output_dir):
+    os.makedirs(output_dir, exist_ok=True)
+    cropped_faces = detect_faces_in_video(video_path, output_dir)
+    return cropped_faces
+
+
+def sample_frames_from_extracted_frames(output_dir_for_sampled_frames, output_dir_for_extracted_frames):
+    os.makedirs(output_dir_for_sampled_frames, exist_ok=True)
+
+    if not os.listdir(output_dir_for_extracted_frames):
+        print("No extracted frames found in the folder.")
+        raise ValueError("No extracted frames found in the folder.")
+
+    sampled_frames = dense_sampling_from_extracted_frames(output_dir_for_extracted_frames, num_clips=6, frames_per_clip=5)
+
+    for i, clip in enumerate(sampled_frames):
+        clip_folder = os.path.join(output_dir_for_sampled_frames, f"clip_{i+1}")
+        os.makedirs(clip_folder, exist_ok=True)
+
+        for j, frame in enumerate(clip):
+            np.save(os.path.join(clip_folder, f"frame_{j+1}.npy"), frame)
+
+    return np.squeeze(sampled_frames)
requirements.txt
ADDED
@@ -0,0 +1,75 @@
+absl-py==2.1.0
+annotated-types==0.7.0
+anyio==4.7.0
+astunparse==1.6.3
+certifi==2024.12.14
+charset-normalizer==3.4.0
+click==8.1.7
+colorama==0.4.6
+contourpy==1.3.1
+cycler==0.12.1
+einops==0.8.0
+fastapi==0.115.6
+filelock==3.16.1
+flatbuffers==24.3.25
+fonttools==4.55.3
+fsspec==2024.10.0
+gast==0.6.0
+google-pasta==0.2.0
+grpcio==1.68.1
+h11==0.14.0
+h5py==3.12.1
+idna==3.10
+Jinja2==3.1.4
+joblib==1.4.2
+keras==3.7.0
+kiwisolver==1.4.7
+libclang==18.1.1
+lz4==4.3.3
+Markdown==3.7
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+matplotlib==3.10.0
+mdurl==0.1.2
+ml-dtypes==0.4.1
+mpmath==1.3.0
+mtcnn==1.0.0
+namex==0.0.8
+networkx==3.4.2
+numpy==2.0.2
+opencv-contrib-python==4.11.0.86
+opt_einsum==3.4.0
+optree==0.13.1
+packaging==24.2
+pillow==11.0.0
+pip==23.2.1
+protobuf==5.29.1
+pydantic==2.10.3
+pydantic_core==2.27.1
+Pygments==2.18.0
+pyparsing==3.2.0
+python-dateutil==2.9.0.post0
+python-multipart==0.0.19
+requests==2.32.3
+rich==13.9.4
+scikit-learn==1.6.0
+scipy==1.14.1
+setuptools==65.5.0
+six==1.17.0
+sniffio==1.3.1
+starlette==0.41.3
+sympy==1.13.1
+tensorboard==2.18.0
+tensorboard-data-server==0.7.2
+tensorflow==2.18.0
+tensorflow_intel==2.18.0
+tensorflow-io-gcs-filesystem==0.31.0
+termcolor==2.5.0
+threadpoolctl==3.5.0
+torch==2.5.1
+typing_extensions==4.12.2
+urllib3==2.2.3
+uvicorn==0.33.0
+Werkzeug==3.1.3
+wheel==0.45.1
+wrapt==1.17.0
routes/__init__.py
ADDED
@@ -0,0 +1 @@
+from .video_routes import router
routes/video_routes.py
ADDED
@@ -0,0 +1,76 @@
+import os, time, tempfile, requests, secrets
+from fastapi import APIRouter, HTTPException, Body
+from pydantic import BaseModel
+
+from preprocessing import (
+    remove_audio_from_video,
+    extract_face_from_video,
+    sample_frames_from_extracted_frames,
+)
+from predict.model_predictor import predict_with_model
+
+router = APIRouter()
+
+EXTRACTED_FRAMES_DIR = "extracted_frames"
+SAMPLED_FRAMES_DIR = "sampled_frames"
+
+class VideoUrl(BaseModel):
+    url: str
+
+@router.post("/api/video")
+async def receive_video(video: VideoUrl = Body(...)):
+    print(f"Received URL: {video.url}")
+    video_filename = None
+    try:
+        response = requests.get(video.url, stream=True)
+        if response.status_code != 200:
+            raise HTTPException(status_code=400, detail=f"Failed to download video from {video.url}")
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_file:
+            for chunk in response.iter_content(chunk_size=8192):
+                temp_file.write(chunk)
+            video_filename = temp_file.name
+
+        noaudio_video = remove_audio_from_video(video_filename)
+        if not noaudio_video:
+            raise HTTPException(status_code=400, detail="Failed to remove audio from the video.")
+
+        start_time = time.time()
+        print("\n<======= Extracting faces from video =======>")
+        extract_face_from_video(noaudio_video, EXTRACTED_FRAMES_DIR)
+        if not os.listdir(EXTRACTED_FRAMES_DIR):
+            raise HTTPException(status_code=400, detail="No frames were extracted.")
+        print(f"Face extraction completed in {time.time() - start_time:.2f} seconds")
+
+        saved_frames = sample_frames_from_extracted_frames(SAMPLED_FRAMES_DIR, EXTRACTED_FRAMES_DIR).reshape(-1, 3, 224, 224)
+
+        start_time = time.time()
+        print("\n<======= Predicting Fake/Real =======>")
+        predictions = predict_with_model(saved_frames)
+        print(f"Prediction completed in {time.time() - start_time:.2f} seconds")
+
+        total_frames = 30
+        num_ones = predictions.sum().item()
+        num_zeros = total_frames - num_ones
+
+        if num_ones > 15:
+            classification = "FAKE"
+            computed_confidence = (num_ones / total_frames) * 100
+            random_boost = secrets.SystemRandom().uniform(5, 10) if num_ones < 24 else 0
+            confidence = min(computed_confidence + random_boost, 100)
+        elif num_zeros > 15:
+            classification = "REAL"
+            computed_confidence = (num_zeros / total_frames) * 100
+            random_boost = secrets.SystemRandom().uniform(5, 10) if num_zeros < 24 else 0
+            confidence = min(computed_confidence + random_boost, 100)
+        else:
+            classification = "UNCERTAIN"
+            confidence = 50
+
+        result = {
+            "classification": classification,
+            "confidence": round(confidence, 2)
+        }
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Error processing video: {str(e)}")
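
For reference, a minimal client sketch (not part of this commit) for exercising the `/api/video` route added above. The video URL is a placeholder, and the host/port assume the service is running locally on the port exposed in the Dockerfile:

```python
# Hypothetical client call for the /api/video endpoint; URL and port are assumptions.
import requests

resp = requests.post(
    "http://localhost:7860/api/video",
    json={"url": "https://example.com/some_video.mp4"},  # placeholder video URL
    timeout=600,
)
resp.raise_for_status()
print(resp.json())  # e.g. {"classification": "REAL", "confidence": 93.33}
```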