File size: 3,111 Bytes
e991118
 
97bee26
 
e991118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ff6193
e991118
 
 
 
 
 
 
 
 
 
 
 
 
 
f747801
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import random
import numpy as np

from ..core.detector import WmDetector

def generate_pastel_color():
    """Generate a pastel color in HSL format."""
    h = random.random()  # Random hue
    s = 0.3 + random.random() * 0.2  # Saturation between 0.3-0.5
    l = 0.8 + random.random() * 0.1  # Lightness between 0.8-0.9
    return f"hsl({h*360}, {s*100}%, {l*100}%)"

def color_from_score(score: float) -> str:
    """Map a score in [0, 1] to a pastel HSL color string.

    Scores below 0.5 map to pastel red (hue 0) and scores of 0.5 and
    above to pastel green (hue 120); saturation grows toward the
    extremes, so 0.0 and 1.0 are the most vivid. A NaN or non-numeric
    score falls back to a pastel gray.

    Args:
        score: Token score in [0, 1]; may be NaN for unscored tokens.

    Returns:
        A CSS ``hsl(...)`` color string.
    """
    # Accept ints as well as floats: an exact 0 or 1 produced upstream
    # should be colored, not silently rendered as the NaN gray.
    if isinstance(score, (int, float)) and not np.isnan(score):
        # Red for low scores, green for high scores
        h = 0 if score < 0.5 else 120  # 0 = red, 120 = green
        s = 0.3 + 0.2 * abs(2 * score - 1)  # higher saturation for extreme values
        l = 0.85  # constant lightness keeps the palette pastel
        return f"hsl({h}, {s*100}%, {l*100}%)"
    return "hsl(0, 0%, 85%)"  # Pastel gray for NaN

def get_token_details(
    text: str, 
    detector: WmDetector
) -> list:
    """Run the watermark detector on *text* and gather per-token display info.

    Args:
        text: The text to score.
        detector: The watermark detector used to score each token.

    Returns:
        A list of per-token dicts with keys ``'is_scored'``, ``'token'``,
        ``'color'``, ``'score'`` and ``'pvalue'``, followed by one final
        summary dict with keys ``'final_score'``, ``'ntoks_scored'`` and
        ``'final_pvalue'``.  (Note: the return type was previously
        annotated ``tuple``, but a list has always been returned.)
    """
    # Get scores for each token
    token_details = detector.get_details(text)

    # Get p-values for each token
    pvalues, aux_info = detector.get_pvalues_by_tok(token_details)
    
    display_info = []
    for token_detail, pvalue in zip(token_details, pvalues):
        # Unscored tokens get NaN so color_from_score renders them gray.
        score = token_detail['score'] if token_detail['is_scored'] else float('nan')
        # Convert numpy scalars to native Python types so the result is
        # JSON-serializable for the display layer.
        if isinstance(score, (np.floating, np.integer)):
            score = float(score)
        if isinstance(pvalue, (np.floating, np.integer)):
            pvalue = float(pvalue)
            
        display_info.append({
            'is_scored': token_detail['is_scored'],
            'token': token_detail['token_text'],
            'color': color_from_score(score),
            'score': score,
            'pvalue': pvalue
        })
    
    # Add summary statistics and convert numpy types to native Python types
    display_info.append({
        'final_score': float(aux_info['final_score']),
        'ntoks_scored': int(aux_info['ntoks_scored']),
        'final_pvalue': float(aux_info['final_pvalue'])
    })
    
    return display_info

def template_prompt(instruction: str, prompt_type: str = "smollm") -> str:
    """Wrap a raw instruction in the chat template the model expects.

    Args:
        instruction: The raw prompt/instruction to template
        prompt_type: Type of prompt format (smollm, alpaca)

    Returns:
        The formatted prompt ready for the model

    Raises:
        ValueError: If ``prompt_type`` is not a supported format.
    """
    if prompt_type == "smollm":
        system_turn = "<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n"
        user_turn = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
        return system_turn + user_turn
    if prompt_type == "alpaca":
        # Alpaca-style models take the bare instruction with no wrapper.
        return instruction
    raise ValueError(f"Prompt type {prompt_type} not supported")