Spaces:

dad1909
/

CyberCode

Paused

App Files Community

s3777091 committed on Jun 5, 2024

Commit

4deeced

1 Parent(s): 9c1bd77

start

Browse files

Files changed (2) hide show

app.py +153 -0
requirements.txt +11 -0

app.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import os
+import torch
+from unsloth import FastLanguageModel, is_bfloat16_supported
+from trl import SFTTrainer
+from transformers import TrainingArguments
+from datasets import load_dataset
+import gradio as gr
+# Sequence-length limit; passed to the model loader here and to SFTTrainer later in the script.
+max_seq_length = 4096
+# Forwarded unchanged to from_pretrained. None presumably lets unsloth pick the dtype - TODO confirm.
+dtype = None
+# Forwarded to from_pretrained; the checkpoint name used below also ends in "bnb-4bit".
+load_in_4bit = True
+# Hugging Face access token read from the environment variable named "Token";
+# os.getenv returns None when that variable is not set.
+hf_token = os.getenv("Token")
+print("Starting model and tokenizer loading...")
+# Load the model and tokenizer
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
+    max_seq_length=max_seq_length,
+    dtype=dtype,
+    load_in_4bit=load_in_4bit,
+    token=hf_token
+)
+print("Model and tokenizer loaded successfully.")
+print("Configuring PEFT model...")
+# Rebind `model` to the PEFT-wrapped model returned by get_peft_model.
+# The lora_* arguments indicate a LoRA adapter; target_modules are presumably the
+# attention and MLP projection layers of the Llama architecture - TODO confirm.
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=16,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+    lora_alpha=16,
+    lora_dropout=0,
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=3407,  # same value as the TrainingArguments seed used later
+    use_rslora=False,
+    loftq_config=None,
+)
+print("PEFT model configured.")
+# Prompt templates keyed by prompt type; the keys match the non-"unknown" values
+# returned by detect_prompt_type. Each template has two positional "{}" slots,
+# which formatting_prompts_func fills with the instruction and the output, in that order.
+alpaca_prompt = {
+    "learning_from": """Below is a CVE definition.
+### CVE definition:
+{}
+### detail CVE:
+{}""",
+    "definition": """Below is a definition about software vulnerability. Explain it.
+### Definition:
+{}
+### Explanation:
+{}""",
+    "code_vulnerability": """Below is a code snippet. Identify the line of code that is vulnerable and describe the type of software vulnerability.
+### Code Snippet:
+{}
+### Vulnerability solution:
+{}"""
+}
+# The tokenizer's end-of-sequence token; formatting_prompts_func appends it to every text.
+EOS_TOKEN = tokenizer.eos_token
+def detect_prompt_type(instruction):
+    if instruction.startswith("what is code vulnerable of this code:"):
+        return "code_vulnerability"
+    elif instruction.startswith("Learning from"):
+        return "learning_from"
+    elif instruction.startswith("what is"):
+        return "definition"
+    else:
+        return "unknown"
+def formatting_prompts_func(examples):
+    instructions = examples["instruction"]
+    outputs = examples["output"]
+    texts = []
+    for instruction, output in zip(instructions, outputs):
+        prompt_type = detect_prompt_type(instruction)
+        if prompt_type in alpaca_prompt:
+            prompt = alpaca_prompt[prompt_type].format(instruction, output)
+        else:
+            prompt = instruction + "\n\n" + output
+        text = prompt + EOS_TOKEN
+        texts.append(text)
+    return {"text": texts}
+print("Loading dataset...")
+# Load the "train" split of the dad1909/DCSV dataset.
+dataset = load_dataset("dad1909/DCSV", split="train")
+print("Dataset loaded successfully.")
+print("Applying formatting function to the dataset...")
+# batched=True: formatting_prompts_func reads the "instruction" and "output"
+# columns of each batch and returns a "text" column.
+dataset = dataset.map(formatting_prompts_func, batched=True)
+print("Formatting function applied.")
+print("Initializing trainer...")
+# Supervised fine-tuning on the "text" column of the formatted dataset.
+# No num_train_epochs / max_steps is passed, so the TrainingArguments defaults
+# decide how long training runs.
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+    dataset_text_field="text",
+    max_seq_length=max_seq_length,
+    dataset_num_proc=2,
+    packing=False,
+    args=TrainingArguments(
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=2,
+        learning_rate=2e-4,
+        # Exactly one of fp16 / bf16 is enabled: bf16 when is_bfloat16_supported()
+        # returns True, fp16 otherwise.
+        fp16=not is_bfloat16_supported(),
+        bf16=is_bfloat16_supported(),
+        warmup_steps=5,
+        logging_steps=10,
+        optim="adamw_8bit",
+        weight_decay=0.01,
+        lr_scheduler_type="linear",
+        seed=3407,
+        output_dir="outputs",
+    ),
+)
+print("Trainer initialized.")
+print("Starting training...")
+# trainer_stats keeps the return value of train(); it is not read again in this script.
+trainer_stats = trainer.train()
+print("Training completed.")
+print("Saving the trained model...")
+model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
+print("Model saved successfully.")
+print("Pushing the model to the hub...")
+model.push_to_hub_merged(
+    "CyberSentinel-16bit",
+    tokenizer,
+    save_method="merged_16bit",
+    token=True
+)
+print("Model pushed to hub successfully.")
+# Gradio app
+print("Launching Gradio app...")
+def greet(name):
+    return "Hello " + name + "!!"
+# Single text-in / text-out interface wrapped around greet.
+demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+# NOTE(review): launch() presumably blocks while serving when run as a script, so
+# the message below may only be printed after the server stops - confirm.
+demo.launch()
+print("Gradio app launched.")

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
+xformers[cuda]
+torch
+transformers
+datasets
+gradio
+trl
+peft
+accelerate
+bitsandbytes
+huggingface_hub