# -*- coding: utf-8 -*-
"""
[Martinez-Gil2024] Augmenting the Interpretability of GraphCodeBERT for Code
Similarity Tasks, arXiv preprint arXiv:2410.05275, 2024

@author: Jorge Martinez-Gil
"""

import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.decomposition import PCA
from transformers import RobertaTokenizer, RobertaModel

# Initialize GraphCodeBERT
tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
model = RobertaModel.from_pretrained("microsoft/graphcodebert-base")
model.eval()  # inference only; make sure dropout is disabled

# Define the classical sorting algorithms
sorting_algorithms = {
    "Bubble_Sort": """
def bubble_sort(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n-i-1):
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]
    return arr
""",
    "Selection_Sort": """
def selection_sort(arr):
    for i in range(len(arr)):
        min_idx = i
        for j in range(i+1, len(arr)):
            if arr[j] < arr[min_idx]:
                min_idx = j
        arr[i], arr[min_idx] = arr[min_idx], arr[i]
    return arr
""",
    "Insertion_Sort": """
def insertion_sort(arr):
    for i in range(1, len(arr)):
        key = arr[i]
        j = i - 1
        while j >= 0 and key < arr[j]:
            arr[j + 1] = arr[j]
            j -= 1
        arr[j + 1] = key
    return arr
""",
    "Merge_Sort": """
def merge_sort(arr):
    if len(arr) > 1:
        mid = len(arr) // 2
        L = arr[:mid]
        R = arr[mid:]
        merge_sort(L)
        merge_sort(R)
        i = j = k = 0
        while i < len(L) and j < len(R):
            if L[i] < R[j]:
                arr[k] = L[i]
                i += 1
            else:
                arr[k] = R[j]
                j += 1
            k += 1
        while i < len(L):
            arr[k] = L[i]
            i += 1
            k += 1
        while j < len(R):
            arr[k] = R[j]
            j += 1
            k += 1
    return arr
""",
    "Quick_Sort": """
def partition(arr, low, high):
    i = low - 1
    pivot = arr[high]
    for j in range(low, high):
        if arr[j] <= pivot:
            i = i + 1
            arr[i], arr[j] = arr[j], arr[i]
    arr[i+1], arr[high] = arr[high], arr[i+1]
    return i + 1

def quick_sort(arr, low, high):
    if low < high:
        pi = partition(arr, low, high)
        quick_sort(arr, low, pi-1)
        quick_sort(arr, pi+1, high)
    return arr
"""
}

# Function to get token embeddings for a code snippet
def get_token_embeddings(code):
    inputs = tokenizer(code, return_tensors="pt", max_length=512,
                       truncation=True, padding=True)
    with torch.no_grad():  # inference only; skip building the autograd graph
        outputs = model(**inputs)
    # One embedding per token: shape (sequence_length, hidden_size)
    token_embeddings = outputs.last_hidden_state.squeeze(0).numpy()
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze(0).tolist())
    return token_embeddings, tokens

# Directory to save images
output_dir = "pca_pairwise_comparisons"
os.makedirs(output_dir, exist_ok=True)

# Generate all possible pairs of sorting algorithms
algorithm_pairs = list(itertools.combinations(sorting_algorithms.keys(), 2))

# Loop over each pair and generate the visualizations
for (algo1_name, algo2_name) in algorithm_pairs:
    algo1_code = sorting_algorithms[algo1_name]
    algo2_code = sorting_algorithms[algo2_name]

    # Get token embeddings for both algorithms
    algo1_embeddings, algo1_tokens = get_token_embeddings(algo1_code)
    algo2_embeddings, algo2_tokens = get_token_embeddings(algo2_code)

    # Combine embeddings so both snippets share one PCA projection
    all_embeddings = np.concatenate((algo1_embeddings, algo2_embeddings), axis=0)

    # Reduce dimensionality to 2D using PCA
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(all_embeddings)

    # Plotting the token embeddings in 2D
    plt.figure(figsize=(10, 8), dpi=300)

    # Scatter plot for the first algorithm tokens
    plt.scatter(embeddings_2d[:len(algo1_tokens), 0],
                embeddings_2d[:len(algo1_tokens), 1],
                color='red', s=50, label=algo1_name, alpha=0.8)

    # Scatter plot for the second algorithm tokens
    plt.scatter(embeddings_2d[len(algo1_tokens):, 0],
                embeddings_2d[len(algo1_tokens):, 1],
                color='blue', s=50, label=algo2_name, alpha=0.8)

    # Make the visualization more professional
    plt.xticks([])
    plt.yticks([])
    plt.xlabel('')
    plt.ylabel('')
    plt.grid(False)
    plt.legend()

    # Save the figure as a high-quality PNG file
    output_file = os.path.join(output_dir,
                               f"{algo1_name}_vs_{algo2_name}_tokens_2d_pca.png")
    plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight')

    # Close the figure to free memory before the next pair
    plt.close()

print("All pairwise comparison images have been generated.")
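
# Hedged addition (not in the original script): projecting token embeddings
# to 2D discards whatever variance PCA leaves in the remaining components, so
# the explained-variance ratio is a useful sanity check on how faithful each
# scatter plot is. This is a minimal sketch reusing the helpers above; the
# name `report_pca_variance` is illustrative, not taken from the paper.
def report_pca_variance(algo1_name, algo2_name):
    """Print the share of variance captured by the first two PCA components."""
    emb1, _ = get_token_embeddings(sorting_algorithms[algo1_name])
    emb2, _ = get_token_embeddings(sorting_algorithms[algo2_name])
    pca = PCA(n_components=2)
    pca.fit(np.concatenate((emb1, emb2), axis=0))
    captured = pca.explained_variance_ratio_.sum()
    print(f"{algo1_name} vs {algo2_name}: 2-D PCA captures {captured:.1%} of the variance")

# Example usage:
# report_pca_variance("Bubble_Sort", "Quick_Sort")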

# Interactive Gradio demo: pick two algorithms and render the same PCA plot
import gradio as gr
from io import BytesIO
from PIL import Image

def compare_algorithms(algo1_name, algo2_name):
    algo1_code = sorting_algorithms[algo1_name]
    algo2_code = sorting_algorithms[algo2_name]

    # Get token embeddings
    algo1_embeddings, algo1_tokens = get_token_embeddings(algo1_code)
    algo2_embeddings, algo2_tokens = get_token_embeddings(algo2_code)

    # Combine and reduce
    all_embeddings = np.concatenate((algo1_embeddings, algo2_embeddings), axis=0)
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(all_embeddings)

    # Plot
    plt.figure(figsize=(6, 5), dpi=150)
    plt.scatter(embeddings_2d[:len(algo1_tokens), 0],
                embeddings_2d[:len(algo1_tokens), 1],
                color='red', s=20, label=algo1_name)
    plt.scatter(embeddings_2d[len(algo1_tokens):, 0],
                embeddings_2d[len(algo1_tokens):, 1],
                color='blue', s=20, label=algo2_name)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.legend()

    # Save to an in-memory buffer and return a PIL image for Gradio
    buf = BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight')
    plt.close()
    buf.seek(0)
    return Image.open(buf)

interface = gr.Interface(
    fn=compare_algorithms,
    inputs=[
        gr.Dropdown(choices=list(sorting_algorithms.keys()), label="Algorithm 1"),
        gr.Dropdown(choices=list(sorting_algorithms.keys()), label="Algorithm 2")
    ],
    outputs=gr.Image(type="pil", label="Token PCA Plot"),
    title="Code Similarity Visualization with GraphCodeBERT"
)

if __name__ == "__main__":
    interface.launch()
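
# Hedged addition (not part of the original script): the plots above are
# qualitative. A common complementary measure in code-similarity work is the
# cosine similarity between mean-pooled token embeddings. The helper below is
# a minimal sketch of that idea; its name `embedding_cosine_similarity` is
# illustrative rather than taken from the paper. It is defined after the
# __main__ guard, so it is intended for interactive or import-time use.
def embedding_cosine_similarity(algo1_name, algo2_name):
    """Cosine similarity between mean-pooled token embeddings of two snippets."""
    emb1, _ = get_token_embeddings(sorting_algorithms[algo1_name])
    emb2, _ = get_token_embeddings(sorting_algorithms[algo2_name])
    v1 = emb1.mean(axis=0)
    v2 = emb2.mean(axis=0)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

# Example usage:
# print(embedding_cosine_similarity("Bubble_Sort", "Selection_Sort"))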