cyberandy committed on
Commit 6465b33 · verified · 1 Parent(s): a24593e

Update app.py

Files changed (1)
  1. app.py +75 -30
app.py CHANGED
@@ -2,12 +2,19 @@ import gradio as gr
 import requests
 from typing import Dict, Tuple, List
 from operator import itemgetter
+from collections import Counter
+import logging
 
-def get_top_features(text: str, k: int = 5) -> Dict:
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def get_features(text: str, k: int = 5) -> Dict:
+    """Get neural features from the API with detailed logging."""
     url = "https://www.neuronpedia.org/api/search-with-topk"
     payload = {
         "modelId": "gemma-2-2b",
-        "layer": "0-gemmascope-mlp-16k",
+        "layer": "20-gemmascope-res-16k", # Updated to match website
         "sourceSet": "gemma-scope",
         "text": text,
         "k": k,
@@ -15,43 +22,80 @@ def get_top_features(text: str, k: int = 5) -> Dict:
         "ignoreBos": True
     }
 
-    response = requests.post(
-        url,
-        headers={"Content-Type": "application/json"},
-        json=payload
-    )
-    return response.json() if response.status_code == 200 else None
+    try:
+        response = requests.post(
+            url,
+            headers={"Content-Type": "application/json"},
+            json=payload
+        )
+        response.raise_for_status()
+        data = response.json()
+
+        # Log the raw response for analysis
+        logger.info(f"API Response: {data}")
+
+        # Analyze feature distribution
+        all_features = []
+        feature_counter = Counter()
+
+        for result in data['results']:
+            token = result['token']
+            logger.info(f"\nToken: {token}")
+
+            for feature in result['top_features']:
+                feature_id = feature['feature_index']
+                activation = feature['activation_value']
+                logger.info(f"Feature {feature_id}: {activation}")
+
+                all_features.append({
+                    'token': token,
+                    'feature_id': feature_id,
+                    'activation': activation,
+                    'feature_data': feature.get('feature', {})
+                })
+                feature_counter[feature_id] += 1
+
+        # Log feature frequency analysis
+        logger.info("\nFeature Frequencies:")
+        for feature_id, count in feature_counter.most_common():
+            logger.info(f"Feature {feature_id}: {count} occurrences")
+
+        return data, all_features, feature_counter
+
+    except Exception as e:
+        logger.error(f"Error in API call: {str(e)}")
+        return None, [], Counter()
 
-def format_output(data: Dict) -> Tuple[str, str, str]:
+def format_output(text: str) -> Tuple[str, str, str]:
+    data, all_features, feature_counter = get_features(text)
+
     if not data:
         return "Error analyzing text", "", ""
 
-    # Collect all features from all tokens
-    all_features = []
-    for result in data['results']:
-        token = result['token']
-        if token == '<bos>':
-            continue
-
-        for feature in result['top_features']:
-            all_features.append({
-                'token': token,
-                'feature_id': feature['feature_index'],
-                'activation': feature['activation_value'],
-                'feature_data': feature.get('feature', {})
-            })
+    # Sort features by frequency first, then by maximum activation within each feature
+    feature_activations = {}
+    for feature in all_features:
+        feature_id = feature['feature_id']
+        activation = feature['activation']
+        if feature_id not in feature_activations or activation > feature_activations[feature_id]['activation']:
+            feature_activations[feature_id] = feature
+
+    # Get top features by frequency, then sort by activation
+    most_common_features = [
+        feature_activations[feature_id]
+        for feature_id, _ in feature_counter.most_common()
+    ]
 
-    # Sort all features by activation value and get top 5
-    top_features = sorted(all_features, key=itemgetter('activation'), reverse=True)[:5]
+    # Sort by activation within the most common features
+    top_features = sorted(most_common_features, key=itemgetter('activation'), reverse=True)[:5]
 
     # Format output
     output = "# Neural Feature Analysis\n\n"
     output += "## Top 5 Most Active Features\n\n"
 
     for idx, feat in enumerate(top_features, 1):
-        feature_url = f"https://www.neuronpedia.org/gemma-2-2b/0-gemmascope-mlp-16k/{feat['feature_id']}"
+        feature_url = f"https://www.neuronpedia.org/gemma-2-2b/20-gemmascope-res-16k/{feat['feature_id']}"
 
-        # Try to get feature name/description if available
         feature_info = ""
         if 'name' in feat['feature_data']:
             feature_info = f" - {feat['feature_data']['name']}"
@@ -61,12 +105,13 @@ def format_output(data: Dict) -> Tuple[str, str, str]:
         output += f"### {idx}. Feature {feat['feature_id']}{feature_info}\n"
         output += f"- **Token:** '{feat['token']}'\n"
         output += f"- **Activation:** {feat['activation']:.2f}\n"
+        output += f"- **Frequency:** {feature_counter[feat['feature_id']]} occurrences\n"
        output += f"- [View on Neuronpedia]({feature_url})\n\n"
 
     # Use highest activation feature for dashboard
     if top_features:
         top_feature = top_features[0]
-        dashboard_url = f"https://www.neuronpedia.org/gemma-2-2b/0-gemmascope-mlp-16k/{top_feature['feature_id']}?embed=true&embedexplanation=true&embedplots=true&embedtest=true&height=300"
+        dashboard_url = f"https://www.neuronpedia.org/gemma-2-2b/20-gemmascope-res-16k/{top_feature['feature_id']}?embed=true&embedexplanation=true&embedplots=true&embedtest=true&height=300"
         iframe = f'''
         <div style="border:1px solid #eee;border-radius:8px;padding:1px;background:#fff;">
             <iframe
@@ -99,9 +144,9 @@ def create_interface():
             )
             analyze_btn = gr.Button("Analyze Features", variant="primary")
             gr.Examples([
+                "WordLift",
                 "Nike - Just Do It. The power of determination.",
                 "Apple - Think Different. Innovation redefined.",
-                "McDonald's - I'm Lovin' It. Creating joy.",
             ], inputs=input_text)
 
         with gr.Column():
@@ -110,7 +155,7 @@ def create_interface():
             dashboard = gr.HTML()
 
         analyze_btn.click(
-            fn=lambda text: format_output(get_top_features(text)),
+            fn=lambda text: format_output(text),
             inputs=input_text,
             outputs=[output_text, dashboard, feature_label]
         )
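
Review note: the behavioural change here is the ranking in format_output, which now keeps the highest activation seen for each feature and then orders the most frequent features by that peak activation. A minimal offline sketch of that logic follows; the mocked results payload only imitates the shape of the search-with-topk response used in app.py, and its tokens, feature indices, and activation values are invented for illustration.

from collections import Counter
from operator import itemgetter

# Mocked fragment shaped like data['results'] in app.py (values are made up)
mock_results = [
    {"token": "Just", "top_features": [
        {"feature_index": 111, "activation_value": 3.2, "feature": {}},
        {"feature_index": 222, "activation_value": 1.1, "feature": {}},
    ]},
    {"token": "Do", "top_features": [
        {"feature_index": 222, "activation_value": 4.0, "feature": {}},
    ]},
]

all_features = []
feature_counter = Counter()
for result in mock_results:
    for feature in result["top_features"]:
        all_features.append({
            "token": result["token"],
            "feature_id": feature["feature_index"],
            "activation": feature["activation_value"],
        })
        feature_counter[feature["feature_index"]] += 1

# Keep the highest-activation record per feature, as the new format_output does
best = {}
for feat in all_features:
    fid = feat["feature_id"]
    if fid not in best or feat["activation"] > best[fid]["activation"]:
        best[fid] = feat

# Order the most frequent features by their peak activation and keep the top 5
top_features = sorted(
    (best[fid] for fid, _ in feature_counter.most_common()),
    key=itemgetter("activation"),
    reverse=True,
)[:5]

print(top_features)  # feature 222 (activation 4.0) ranks above feature 111 (3.2)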