Spaces: Running on Zero

Upload 3 files

- app.py +16 -59
- dataset.py +24 -122
- model.py +52 -363
app.py
CHANGED
@@ -2,9 +2,11 @@ import spaces
 import gradio as gr
 import os
 import torch
-from model import
+from model import SpoofVerificationModel  # custom model module
 import dataset  # custom dataset module
 from huggingface_hub import hf_hub_download
+from transformers import AutoFeatureExtractor
+
 
 @spaces.GPU
 def dummy():  # just a dummy
@@ -14,7 +16,7 @@ def dummy():  # just a dummy
 def load_model():
     checkpoint_path = hf_hub_download(
         repo_id="amphion/deepfake_detection",
-        filename="
+        filename="checkpoints_w2v-bert_SpoofVerification_MultiDataset/model_checkpoint_4_new.pth",
         repo_type="model"
     )
     if not os.path.exists(checkpoint_path):
@@ -33,12 +35,12 @@ def detect_on_gpu(dataset):
     print(f"Using device: {device}")
 
     print("Initializing model...")
-    model =
+    model = SpoofVerificationModel().to(device)
 
     print(f"Loading model weights: {checkpoint_path}")
     checkpoint = torch.load(checkpoint_path, map_location=device)
     model_state_dict = checkpoint['model_state_dict']
-    threshold = 0.
+    threshold = 0.5
     print(f"Detection threshold set to: {threshold}")
 
     # Handle the keys of the model state dict
@@ -53,45 +55,18 @@ def detect_on_gpu(dataset):
     model.eval()
     print("Model loaded, entering eval mode")
 
+    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
+
     print("\nStarting to process audio data...")
     with torch.no_grad():
         for batch_idx, batch in enumerate(dataset):
             print(f"\nProcessing batch {batch_idx + 1}")
-
-            print(f"Main feature shape: {main_features['input_features'].shape}")
-
-            if len(batch['prompt_features']) > 0:
-                print("\nPreparing prompt features...")
-                prompt_features = [{
-                    'input_features': pf['input_features'].to(device),
-                    'attention_mask': pf['attention_mask'].to(device)
-                } for pf in batch['prompt_features']]
-                print(f"Number of prompt features: {len(prompt_features)}")
-                print(f"First prompt feature shape: {prompt_features[0]['input_features'].shape}")
-
-                print("\nPreparing prompt labels...")
-                prompt_labels = batch['prompt_labels'].to(device)
-                print(f"Prompt label shape: {prompt_labels.shape}")
-                print(f"Prompt label values: {prompt_labels}")
-            else:
-                prompt_features = []
-                prompt_labels = []
-
-            print("\nRunning model inference...")
-            outputs = model({
-                'main_features': main_features,
-                'prompt_features': prompt_features,
-                'prompt_labels': prompt_labels
-            })
-
-            print("\nProcessing model outputs...")
-            avg_scores = outputs['avg_logits'].softmax(dim=-1)
-            deepfake_scores = avg_scores[:, 1].cpu()
+            waveforms = batch['waveforms'].numpy()  # [B, T]
+            features = feature_extractor(waveforms, sampling_rate=16000, return_attention_mask=True, padding_value=0, return_tensors="pt").to(device)
+            outputs = model(features)
+            deepfake_logits = outputs['deepfake_logits']
+
+            deepfake_scores = deepfake_logits.float().softmax(dim=-1)[:, 1].contiguous()
             is_fake = deepfake_scores[0].item() > threshold
 
             result = {"is_fake": is_fake, "confidence": deepfake_scores[0] if is_fake else 1-deepfake_scores[0]}
@@ -101,28 +76,10 @@ def detect_on_gpu(dataset):
     print("\n=== Detection finished ===")
     return result
 
-
-# def audio_deepfake_detection(demonstrations, query_audio_path):
-#     demonstration_paths = [audio[0] for audio in demonstrations if audio[0] is not None]
-#     demonstration_labels = [audio[1] for audio in demonstrations if audio[1] is not None]
-#     if len(demonstration_paths) != len(demonstration_labels):
-#         demonstration_labels = demonstration_labels[:len(demonstration_paths)]
-
-#     # Build the dataset
-#     audio_dataset = dataset.DemoDataset(demonstration_paths, demonstration_labels, query_audio_path)
-
-#     # Call the GPU detection function
-#     result = detect_on_gpu(audio_dataset)
-
-#     return {
-#         "Is AI Generated": result["is_fake"],
-#         "Confidence": f"{100*result['confidence']:.2f}%"
-#     }
-# 0 demonstrations
-def audio_deepfake_detection(query_audio_path):
+def audio_deepfake_detection(audio_path):
 
     # Build the dataset
-    audio_dataset = dataset.DemoDataset(
+    audio_dataset = dataset.DemoDataset(audio_path)
 
     # Call the GPU detection function
     result = detect_on_gpu(audio_dataset)
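Taken together, the new app.py scores a single clip directly: the padded waveform goes through the facebook/w2v-bert-2.0 feature extractor, into SpoofVerificationModel, and the softmax score for the fake class is compared against the 0.5 threshold. A minimal standalone sketch of that path, using the pad helper and SpoofVerificationModel introduced in dataset.py and model.py below; the backbone path and example.wav are assumptions, and the fine-tuned checkpoint that load_model() downloads is not applied here.

# Sketch of the new per-clip scoring path; "facebook/w2v-bert-2.0" as the
# backbone path and "example.wav" are placeholders, not values from this diff,
# and the fine-tuned checkpoint from load_model() is not loaded here.
import librosa
import torch
from transformers import AutoFeatureExtractor

from dataset import pad
from model import SpoofVerificationModel

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load one clip at 16 kHz and clip/repeat-pad it to the fixed length (64600 samples).
waveform, _ = librosa.load("example.wav", sr=16000)
waveform = pad(waveform)

extractor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
features = extractor(
    waveform[None, :],          # [B, T], same layout as batch['waveforms'].numpy() above
    sampling_rate=16000,
    return_attention_mask=True,
    padding_value=0,
    return_tensors="pt",
).to(device)

model = SpoofVerificationModel("facebook/w2v-bert-2.0").to(device).eval()
with torch.no_grad():
    outputs = model(features)
    fake_score = outputs["deepfake_logits"].float().softmax(dim=-1)[0, 1].item()

print(f"deepfake score: {fake_score:.3f} (flagged as fake if > 0.5)")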
dataset.py
CHANGED
@@ -1,133 +1,35 @@
-import torch
 from torch.utils.data import Dataset
-from transformers import AutoFeatureExtractor
-import os
 import librosa
 import numpy as np
+from torch import Tensor
 
-class DemoDataset(Dataset):
-    def __init__(self, demonstration_paths, demonstration_labels, query_path, sample_rate=16000):
-        self.sample_rate = sample_rate
-        self.query_path = query_path
-
-        # Convert to list if single path
-        self.demonstration_paths = demonstration_paths
-        self.demonstration_labels = [0 if label == 'bonafide' else 1 for label in demonstration_labels]
-
-        # Load feature extractor
-        self.feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
-
 
-        x_len = x.shape[0]
-        if x_len >= max_len:
+def pad(x, max_len=64600, random_clip=True):
+    x_len = x.shape[0]
+    if x_len > max_len:
+        # random clip
+        if random_clip:
+            start_idx = np.random.randint(0, x_len - max_len)
+            return x[start_idx:start_idx + max_len]
+        else:
             return x[:max_len]
-
+    # need to pad
+    num_repeats = int(max_len / x_len)+1
+    padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
+    return padded_x
+
+class DemoDataset(Dataset):
+    def __init__(self, path):
+        self.path = path
 
     def __len__(self):
-        return 1
+        return 1
 
     def __getitem__(self, idx):
-
-
-        if len(query_waveform.shape) == 1:
-            query_waveform = query_waveform.unsqueeze(0)
-
-        # Extract features for query audio
-        main_features = self.feature_extractor(
-            query_waveform,
-            sampling_rate=self.sample_rate,
-            padding=True,
-            return_attention_mask=True,
-            return_tensors="pt"
-        )
-
-        # Process demonstration audios
-        prompt_features = []
-        for demo_path in self.demonstration_paths:
-            # Load demonstration audio
-            demo_waveform = self.load_pad(demo_path)
-            demo_waveform = torch.from_numpy(demo_waveform).float()
-            if len(demo_waveform.shape) == 1:
-                demo_waveform = demo_waveform.unsqueeze(0)
-
-            # Extract features
-            prompt_feature = self.feature_extractor(
-                demo_waveform,
-                sampling_rate=self.sample_rate,
-                padding=True,
-                return_attention_mask=True,
-                return_tensors="pt"
-            )
-            prompt_features.append(prompt_feature)
-
-        prompt_labels = torch.tensor([self.demonstration_labels], dtype=torch.long)
-
-        return {
-            'main_features': main_features,
-            'prompt_features': prompt_features,
-            'prompt_labels': prompt_labels,
-            'file_name': os.path.basename(self.query_path),
-            'file_path': self.query_path
-        }
+        waveform, sample_rate = librosa.load(self.path, sr=16000)
+        waveform_pad = pad(waveform)
+        waveform_tensor = Tensor(waveform_pad)
 
-
-        Args:
-            batch: List containing dictionaries with:
-                - main_features: feature extractor output
-                - prompt_features: list of feature extractor outputs
-                - file_name: file name
-                - file_path: file path
-        """
-        batch_size = len(batch)
-
-        # Process main features
-        main_features_keys = batch[0]['main_features'].keys()
-        main_features = {}
-        for key in main_features_keys:
-            main_features[key] = torch.cat([item['main_features'][key] for item in batch], dim=0)
-
-        # Get number of prompts
-        num_prompts = len(batch[0]['prompt_features'])
-
-        # Process prompt features
-        prompt_features = []
-        for i in range(num_prompts):
-            prompt_feature = {}
-            for key in main_features_keys:
-                prompt_feature[key] = torch.cat([item['prompt_features'][i][key] for item in batch], dim=0)
-            prompt_features.append(prompt_feature)
-
-        # Collect file names and paths
-        file_names = [item['file_name'] for item in batch]
-        file_paths = [item['file_path'] for item in batch]
-
-        # Make sure prompt_labels has the right shape [batch_size, num_prompts]
-        prompt_labels = torch.cat([item['prompt_labels'] for item in batch], dim=0)
-
-        return {
-            'main_features': main_features,
-            'prompt_features': prompt_features,
-            'prompt_labels': prompt_labels,
-            'file_names': file_names,
-            'file_paths': file_paths
-        }
-
-if __name__ == '__main__':
-    # Test the dataset
-    demo_paths = ["examples/demo1.wav", "examples/demo2.wav"]
-    query_path = "examples/query.wav"
-
-    dataset = DemoDataset(demo_paths, query_path)
-    print(dataset[0])
+
+        return {
+            'waveforms': waveform_tensor,
+        }
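The rewritten DemoDataset drops the demonstration/prompt machinery entirely: it holds a single file path, and __getitem__ returns a fixed-length waveform tensor produced by the new pad helper (clips longer than max_len are cut, optionally at a random offset; shorter clips are tiled until they reach max_len). A small sketch of the two pad cases, with made-up clip lengths:

import numpy as np

from dataset import pad

# A 1 s clip at 16 kHz (16000 samples) is tiled to cover max_len = 64600 samples.
short_clip = np.random.randn(16000).astype(np.float32)
print(pad(short_clip).shape)                      # (64600,)

# A 10 s clip (160000 samples) is cut down to max_len; random_clip=False keeps the head.
long_clip = np.random.randn(160000).astype(np.float32)
print(pad(long_clip, random_clip=False).shape)    # (64600,)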
model.py
CHANGED
@@ -1,380 +1,69 @@
-import torch
 import torch.nn as nn
 from transformers import Wav2Vec2BertModel
-from llama_nar import LlamaNAREmb
-from transformers import LlamaConfig
-import time
-import torch.nn.functional as F
-from huggingface_hub import hf_hub_download
 
 
-class
-    def __init__(self):
-        super().__init__()
-
-        self.
-
-                param.requires_grad = False
-
-            # Freeze the K,V projections in multi-head attention
-            if any(proj in name for proj in ['linear_k', 'linear_v']):
-                param.requires_grad = False
-
-            # Freeze distance_embedding
-            if 'distance_embedding' in name:
-                param.requires_grad = False
-
-            # Freeze all convolution-related modules
-            if any(conv_name in name for conv_name in [
-                'conv_module', 'pointwise_conv', 'depthwise_conv',
-                'feature_extractor', 'pos_conv_embed', 'conv_layers'
-            ]):
-                param.requires_grad = False
-
-        # 3. Shrink the Llama model
-        self.llama_nar = LlamaNAREmb(
-            config=LlamaConfig(
-                hidden_size=512,
-                num_attention_heads=8,
-                num_hidden_layers=8,
-            ),
-            num_heads=8,
-            num_layers=8,
-            hidden_size=512
-        )
-
-        # 4. Down-projection layer
-        self.projection = nn.Sequential(
-            nn.Linear(1024, 512),
-            nn.LayerNorm(512)
-        )
-
-        # 5. Simplified classification head
-        self.classifier = nn.Sequential(
-            nn.Linear(512, 128),
+class SpoofVerificationModel(nn.Module):
+    def __init__(self, w2v_path, num_types=49):
+        super(SpoofVerificationModel, self).__init__()
+
+        self.wav2vec2 = Wav2Vec2BertModel.from_pretrained(w2v_path, output_hidden_states=True)
+        self.wav2vec_config = self.wav2vec2.config
+
+        self.deepfake_embed = nn.Linear(self.wav2vec2.config.hidden_size, 1024)
+        self.type_embed = nn.Linear(self.wav2vec2.config.hidden_size, 1024)
+
+        self.deepfake_classifier = nn.Sequential(
             nn.ReLU(),
-            nn.
-            nn.Linear(128, 2)
+            nn.Linear(1024, 2)
         )
-
-        # 6. Smaller embedding dimension
-        self.label_embedding = nn.Embedding(num_embeddings=2, embedding_dim=512)
-
-        # 7. Simplified feature-processing layer
-        self.feature_processor = nn.Sequential(
-            nn.Linear(512, 512),
-            nn.LayerNorm(512),
+        self.type_classifier = nn.Sequential(
             nn.ReLU(),
-            nn.
+            nn.Linear(1024, num_types)
         )
-
-        #
-
-
-        #
-
-
-        # Make sure the sequence length is divisible by the factor
-        new_len = seq_len // factor
-        padded_len = new_len * factor
-
-        if seq_len > padded_len:
-            sequence = sequence[:, :padded_len, :]
-
-        # Reshape and average-pool  [batch_size, new_len, factor, hidden_size]
-        reshaped = sequence.reshape(batch_size, new_len, factor, hidden_size)
-        downsampled = torch.mean(reshaped, dim=2)  # [batch_size, new_len, hidden_size]
-        return downsampled
-
-        # 1. Take the last hidden layer and downsample it
-        last_layer = hidden_states[-1]  # [batch_size, seq_len, 1024]
-        downsampled_features = downsample_sequence(last_layer)  # [batch_size, seq_len//10, 1024]
-
-        # 2. Project down to 512 dims
-        projected_features = self.projection(downsampled_features)  # [batch_size, seq_len//10, 512]
-
-        return projected_features  # no unsqueeze needed, the sequence dimension is kept
-
-    def forward(self,
-
-
-    )
-
-        batch_size, num_prompts = batch['prompt_labels'].shape
-
-        # Reshape features for batched processing
-        prompt_features = batch['prompt_features']
-        all_prompt_outputs = []
-
-        for i in range(num_prompts):
-            prompt_output = self.wav2vec2bert(
-                **prompt_features[i]
-            )
-            all_prompt_outputs.append(self._fuse_layers(prompt_output.hidden_states))
-
-        if all_prompt_outputs:
-            fused_prompts = torch.stack([
-                self.feature_processor(p) for p in all_prompt_outputs
-            ], dim=1)  # [batch_size, num_prompts, seq_len, hidden_size]
-
-            # Get label embeddings and expand them to the corresponding sequence length
-            label_embs = self.label_embedding(batch['prompt_labels'])  # [batch_size, num_prompts, 512]
-
-            prompt_embeddings = []
-            for i in range(batch_size):
-                sequence = []
-
-                # Append the demonstration prompts
-                for j in range(num_prompts):
-                    prompt_seq_len = fused_prompts[i, j].size(0)  # sequence length of the current prompt
-
-                    sequence.append(self.special_tokens[1].expand(1, -1))  # [PROMPT]
-                    sequence.append(self.special_tokens[2].expand(1, -1))  # [AUDIO]
-                    sequence.append(fused_prompts[i, j])  # [seq_len, hidden_size]
-                    sequence.append(self.special_tokens[3].expand(1, -1))  # [LABEL]
-
-                    # Expand the label embedding to the same length as the audio features
-                    expanded_label = label_embs[i, j].unsqueeze(0).expand(prompt_seq_len, -1)
-                    sequence.append(expanded_label)  # [seq_len, hidden_size]
-
-                    sequence.append(self.special_tokens[0].expand(1, -1))  # [SEP]
-
-                # Append the main features to be classified
-                main_seq_len = fused_features[i].size(0)  # sequence length of the main features
-                sequence.append(self.special_tokens[1].expand(1, -1))  # [PROMPT]
-                sequence.append(self.special_tokens[2].expand(1, -1))  # [AUDIO]
-                sequence.append(fused_features[i])  # [main_seq_len, hidden_size]
-                sequence.append(self.special_tokens[3].expand(1, -1))  # [LABEL]
-                # Zero vectors at the prediction positions, same length as the main features
-                sequence.append(torch.zeros(main_seq_len, fused_features.size(-1)).to(fused_features.device))
-
-                prompt_embeddings.append(torch.cat(sequence, dim=0))
-
-            prompt_embeddings = torch.stack(prompt_embeddings, dim=0)
-
-        else:
-            # Simplified handling of the no-prompt case
-            batch_size = fused_features.size(0)
-            main_seq_len = fused_features.size(1)  # main feature sequence length
-
-            # Build the sequence [batch_size, total_len, hidden_size]
-            prompt_embeddings = torch.cat([
-                self.special_tokens[1].expand(batch_size, 1, -1),  # [PROMPT]
-                self.special_tokens[2].expand(batch_size, 1, -1),  # [AUDIO]
-                fused_features,  # [batch_size, main_seq_len, hidden_size]
-                self.special_tokens[3].expand(batch_size, 1, -1),  # [LABEL]
-                torch.zeros(batch_size, main_seq_len, fused_features.size(-1)).to(fused_features.device)  # prediction positions
-            ], dim=1)
-
-        # Feed into llama_nar
-        output = self.llama_nar(inputs_embeds=prompt_embeddings)
-
-        # Outputs at all prediction positions (the last main_seq_len positions)
-        pred_pos_embeddings = output[:, -main_seq_len:, :]  # [batch_size, main_seq_len, hidden_size]
-        # Classify each frame
-        frame_logits = self.classifier(pred_pos_embeddings)  # [batch_size, main_seq_len, 2]
-
-        # Return both frame-level logits and utterance-level logits (obtained by averaging)
-        avg_embedding = torch.mean(pred_pos_embeddings, dim=1)  # [batch_size, hidden_size]
-        avg_logits = self.classifier(avg_embedding)  # [batch_size, 2]
-
+        # self.deepfake_classifier = nn.Sequential(
+        #     nn.Linear(self.wav2vec2.config.hidden_size, 1024),
+        #     nn.ReLU(),
+        #     nn.Linear(1024, 2)
+        # )
+
+        # self.type_classifier = nn.Sequential(
+        #     nn.Linear(self.wav2vec2.config.hidden_size, 1024),
+        #     nn.ReLU(),
+        #     nn.Linear(1024, num_types)
+        # )
+
+    def forward(self, audio_features):
+
+        audio_features = self.wav2vec2(**audio_features)  # [B, T, D]
+        audio_features = audio_features.last_hidden_state  # (B, T, D)
+        audio_features = audio_features.mean(dim=1)  # (B, D)
+
+        # deepfake_logits = self.deepfake_classifier(audio_features)
+        # type_logits = self.type_classifier(audio_features)
+
+        deepfake_emb = self.deepfake_embed(audio_features)
+        type_emb = self.type_embed(audio_features)
+        deepfake_logits = self.deepfake_classifier(deepfake_emb)
+        type_logits = self.type_classifier(type_emb)
+
         return {
-            '
-            '
+            'deepfake_logits': deepfake_logits,
+            'type_logits': type_logits,
+            'embeddings': audio_features,
+            'deepfake_embed': deepfake_emb,  # newly added embedding output
+            'type_embed': type_emb  # newly added embedding output
         }
 
+        # return {
+        #     'deepfake_logits': deepfake_logits,
+        #     'type_logits': type_logits,
+        #     'embeddings': audio_features
+        # }
+
+    def print_parameters_info(self):
+        print(f"wav2vec2 parameters: {sum(p.numel() for p in self.wav2vec2.parameters())/1e6:.2f}M")
+        print(f"deepfake_classifier parameters: {sum(p.numel() for p in self.deepfake_classifier.parameters())/1e6:.2f}M")
+        print(f"type_classifier parameters: {sum(p.numel() for p in self.type_classifier.parameters())/1e6:.2f}M")
 
-if __name__ == '__main__':
-    import torch
-    from torch.utils.data import DataLoader
-    from dataset.train_MultiDataset import train_MultiDataset, collate_fn
-    from tqdm import tqdm
-    import time
-
-    # Set the device
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    print(f"\n=== Using device: {device} ===")
-
-    # Initialize the model
-    print("\n=== Initializing model ===")
-    model = Wav2Vec2BERT_Llama().to(device)
-    model.eval()  # switch to eval mode
-
-    # Print the parameter structure of wav2vec2bert
-    print("\n=== Wav2Vec2BERT parameter structure ===")
-    w2v_params_by_layer = {}
-    total_trainable = 0
-    total_frozen = 0
-
-    for name, param in model.wav2vec2bert.named_parameters():
-        # Top-level layer name
-        layer_name = name.split('.')[0]
-        if layer_name not in w2v_params_by_layer:
-            w2v_params_by_layer[layer_name] = {
-                'trainable_params': 0,
-                'frozen_params': 0,
-                'parameter_names': []
-            }
-
-        # Count the parameters
-        if param.requires_grad:
-            w2v_params_by_layer[layer_name]['trainable_params'] += param.numel()
-            total_trainable += param.numel()
-        else:
-            w2v_params_by_layer[layer_name]['frozen_params'] += param.numel()
-            total_frozen += param.numel()
-
-        w2v_params_by_layer[layer_name]['parameter_names'].append(name)
-
-    # Print per-layer details
-    print("\nPer-layer parameter statistics:")
-    for layer_name, info in w2v_params_by_layer.items():
-        trainable_mb = info['trainable_params'] / 1024 / 1024
-        frozen_mb = info['frozen_params'] / 1024 / 1024
-        total_mb = (info['trainable_params'] + info['frozen_params']) / 1024 / 1024
-
-        print(f"\n{layer_name}:")
-        print(f"  - total parameters: {total_mb:.2f}MB")
-        print(f"  - trainable parameters: {trainable_mb:.2f}MB")
-        print(f"  - frozen parameters: {frozen_mb:.2f}MB")
-        print(f"  - parameter names:")
-        for param_name in info['parameter_names']:
-            print(f"    * {param_name}")
-
-    # Print overall statistics
-    print("\n=== Overall statistics ===")
-    print(f"Total trainable parameters: {total_trainable/1024/1024:.2f}MB")
-    print(f"Total frozen parameters: {total_frozen/1024/1024:.2f}MB")
-    print(f"Total parameters: {(total_trainable + total_frozen)/1024/1024:.2f}MB")
-    print(f"Trainable fraction: {total_trainable/(total_trainable + total_frozen)*100:.2f}%")
-
-    # Count the parameters of each module separately
-    wav2vec2bert_params = sum(p.numel() for p in model.wav2vec2bert.parameters())
-    llama_params = sum(p.numel() for p in model.llama_nar.parameters())
-    other_params = sum(p.numel() for name, p in model.named_parameters()
-                       if not name.startswith('wav2vec2bert.') and not name.startswith('llama_nar.'))
-
-    total_params = wav2vec2bert_params + llama_params + other_params
-
-    print(f"\n=== Parameter counts ===")
-    print(f"Wav2Vec2BERT parameters: {wav2vec2bert_params:,} ({wav2vec2bert_params/1024/1024:.2f}MB)")
-    print(f"LlamaNAR parameters: {llama_params:,} ({llama_params/1024/1024:.2f}MB)")
-    print(f"Other module parameters: {other_params:,} ({other_params/1024/1024:.2f}MB)")
-    print(f"Total parameters: {total_params:,} ({total_params/1024/1024:.2f}MB)")
-
-    # Percentages
-    print(f"\n=== Parameter shares ===")
-    print(f"Wav2Vec2BERT: {wav2vec2bert_params/total_params*100:.2f}%")
-    print(f"LlamaNAR: {llama_params/total_params*100:.2f}%")
-    print(f"Other modules: {other_params/total_params*100:.2f}%")
-
-    # Test running time and memory usage
-    print("\n=== Testing running time and memory usage (batch_size=4) ===")
-    batch_size = 4
-    total_samples = 600000
-
-    # Clear the GPU cache
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        initial_memory = torch.cuda.memory_allocated() / 1024 / 1024
-        print(f"Initial GPU memory usage: {initial_memory:.2f}MB")
-
-    # Initialize the dataset
-    print("\nInitializing dataset...")
-    ds = train_MultiDataset(max_prompts=3)
-
-    # Create the DataLoader
-    dl = DataLoader(ds,
-                    batch_size=batch_size,
-                    shuffle=True,
-                    collate_fn=collate_fn,
-                    num_workers=4)
-
-    print(f"\nDataset size: {len(ds)}")
-    print(f"Number of batches: {len(dl)}")
-
-    # Average time over a few batches
-    num_test_batches = 10
-    total_time = 0
-    max_memory = 0
-
-    print(f"\nTiming {num_test_batches} batches...")
-    with torch.no_grad():
-        for i, batch in enumerate(tqdm(dl, total=num_test_batches)):
-            if i >= num_test_batches:
-                break
-
-            # Handle dict-typed features correctly
-            main_features = {
-                'input_features': batch['main_features']['input_features'].to(device),
-                'attention_mask': batch['main_features']['attention_mask'].to(device)
-            }
-
-            prompt_features = [{
-                'input_features': pf['input_features'].to(device),
-                'attention_mask': pf['attention_mask'].to(device)
-            } for pf in batch['prompt_features']]
-
-            labels = batch['labels'].to(device)
-            prompt_labels = batch['prompt_labels'].to(device)
-
-            # Record the start time
-            start_time = time.time()
-
-            # Forward pass
-            outputs = model({
-                'main_features': main_features,
-                'prompt_features': prompt_features,
-                'prompt_labels': prompt_labels
-            })
-
-            # Make sure GPU work has finished
-            if torch.cuda.is_available():
-                torch.cuda.synchronize()
-
-            # Record the end time and memory usage
-            end_time = time.time()
-            total_time += (end_time - start_time)
-
-            if torch.cuda.is_available():
-                current_memory = torch.cuda.memory_allocated() / 1024 / 1024
-                max_memory = max(max_memory, current_memory)
-
-            # Print details of the first batch
-            if i == 0:
-                print("\n=== First batch details ===")
-                print(f"Main feature shape: {main_features['input_features'].shape}")
-                print(f"Main mask shape: {main_features['attention_mask'].shape}")
-                print(f"Prompt feature shape: {prompt_features[0]['input_features'].shape}")
-                print(f"Prompt mask shape: {prompt_features[0]['attention_mask'].shape}")
-                print(f"Label shape: {labels.shape}")
-                print(f"Prompt label shape: {prompt_labels.shape}")
-                print(f"Model output shape: {outputs.shape}")
-                print(f"Output logits range: [{outputs.min().item():.3f}, {outputs.max().item():.3f}]")
-
-    # Compute and print the statistics
-    avg_time = total_time / num_test_batches
-    print(f"\n=== Performance statistics ===")
-    print(f"Average time per batch: {avg_time:.4f}s")
-    print(f"Estimated time for {total_samples} samples: {(total_samples/batch_size*avg_time/3600):.2f}h")
-    if torch.cuda.is_available():
-        print(f"Max GPU memory usage: {max_memory:.2f}MB")
-        print(f"GPU memory growth: {max_memory - initial_memory:.2f}MB")
-
-    print("\nDone!")
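The new SpoofVerificationModel mean-pools the Wav2Vec2Bert hidden states over time, projects the pooled vector through separate 1024-dimensional deepfake and type embedding layers, and classifies each one (2 classes for real/fake, num_types=49 for the spoof type). A minimal shape check, assuming facebook/w2v-bert-2.0 as the w2v_path backbone; this uses only the pretrained backbone, with randomly initialized heads rather than the fine-tuned checkpoint from app.py.

# Shape check only; "facebook/w2v-bert-2.0" as w2v_path is an assumption,
# and the heads here are randomly initialized, not the fine-tuned checkpoint.
import numpy as np
import torch
from transformers import AutoFeatureExtractor

from model import SpoofVerificationModel

model = SpoofVerificationModel("facebook/w2v-bert-2.0").eval()
model.print_parameters_info()

extractor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
dummy = extractor(
    np.zeros((1, 64600), dtype=np.float32),  # one silent fixed-length clip
    sampling_rate=16000,
    return_attention_mask=True,
    return_tensors="pt",
)

with torch.no_grad():
    out = model(dummy)

print(out["deepfake_logits"].shape)  # torch.Size([1, 2])
print(out["type_logits"].shape)      # torch.Size([1, 49])
print(out["embeddings"].shape)       # torch.Size([1, 1024]), the w2v-bert hidden size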