"""Enhance Jupyter notebooks with generated titles and summaries.

For every non-empty code cell, a summarization model produces a short
explanation, and a title is generated from that explanation. Both are inserted
as markdown cells in front of the code cell.
"""

import functools
import logging
import re  # noqa: F401  (kept: other parts of the project may rely on it)
import sys
from collections.abc import Iterable
from tokenize import tokenize  # noqa: F401  (kept from the original imports)

import gradio as gr
import nbformat
import spacy
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    SummarizationPipeline,  # noqa: F401  (kept from the original imports)
    pipeline,
)

logger = logging.getLogger(__name__)

MODEL_NAME = "sagard21/python-code-explainer"

# Where process_notebook() writes its result unless the caller says otherwise.
DEFAULT_OUTPUT_PATH = "enhanced_notebook.ipynb"


class NotebookEnhancer:
    """Insert a generated title cell and summary cell before each code cell."""

    # Generation length bounds passed to the summarization pipeline.
    _MIN_LENGTH = 5
    _MAX_LENGTH = 30

    # spaCy coarse part-of-speech tags that count as a "noun" in is_valid().
    _NOUN_TAGS = frozenset({"NOUN", "PROPN", "PRON"})

    def __init__(self):
        """Load the model, tokenizer, summarization pipeline and spaCy model."""
        self.config = AutoConfig.from_pretrained(MODEL_NAME)
        # NOTE(review): `padding` is normally a call-time tokenizer argument;
        # it is kept here unchanged because its effect at load time is unclear.
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
        self.model.eval()
        # Reuse the model loaded above. Passing MODEL_NAME here would make the
        # pipeline load a second copy of the same weights.
        self.pipeline = pipeline(
            "summarization",
            model=self.model,
            config=self.config,
            tokenizer=self.tokenizer,
        )
        self.nlp = spacy.load("en_core_web_sm")

    def _summarize(self, text):
        """Run the summarization pipeline on *text* and return the stripped text.

        ``truncation=True`` asks the tokenizer to cut over-long inputs down to
        the model's limit instead of feeding them through unbounded.
        """
        result = self.pipeline(
            text,
            min_length=self._MIN_LENGTH,
            max_length=self._MAX_LENGTH,
            truncation=True,
        )
        logger.debug("Pipeline output: %s", result)
        return result[0]["summary_text"].strip()

    def generate_title(self, code):
        """Generate a concise title for a code cell.

        Args:
            code: Text to derive the title from (enhance_notebook passes the
                generated summary).

        Returns:
            A markdown level-1 heading. ``str.capitalize`` is applied, so only
            the first character is upper-cased and the rest is lower-cased.
        """
        title = self._summarize(code)
        logger.debug("Result title: %s", title)
        # Format as a markdown title
        return f"# {title.capitalize()}"

    def _count_num_words(self, code):
        """Return the number of single-space-separated pieces in *code*."""
        return len(code.split(" "))

    def generate_summary(self, code):
        """Generate a detailed summary for a code cell.

        Args:
            code: Source text of the code cell.

        Returns:
            The model's summary after sentence-level post-processing.
        """
        logger.debug("Code: %s", code)
        summary = self._postprocess_summary(self._summarize(code))
        logger.debug("Result summary: %s", summary)
        return summary

    def enhance_notebook(self, notebook: nbformat.notebooknode.NotebookNode):
        """Add title and summary markdown cells before each code cell.

        Args:
            notebook: The notebook to enhance. Its code cells have their
                ``outputs`` cleared in place, and its cells and metadata are
                shared with the returned notebook.

        Returns:
            A new notebook with the original cells in order, where every
            non-empty code cell is preceded by a title cell and a summary cell.
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata
        logger.debug("Input notebook has %d cells", len(notebook.cells))

        # Generated cells are numbered after the existing cells. Ids must be
        # strings in the notebook format, hence the "enhanced-N" form.
        next_id = len(notebook.cells) + 1

        for cell in notebook.cells:
            is_code = cell.cell_type == "code"

            if is_code and cell.source.strip():
                summary = self.generate_summary(cell.source)
                summary_cell = nbformat.v4.new_markdown_cell(summary)
                summary_cell.id = f"enhanced-{next_id}"
                next_id += 1

                # The title is derived from the summary, not from the code.
                title = self.generate_title(summary)
                title_cell = nbformat.v4.new_markdown_cell(title)
                title_cell.id = f"enhanced-{next_id}"
                next_id += 1

                enhanced_notebook.cells.append(title_cell)
                enhanced_notebook.cells.append(summary_cell)

            # Only code cells carry `outputs`; adding the field to markdown or
            # raw cells would make the notebook fail schema validation.
            if is_code:
                cell.outputs = []
            enhanced_notebook.cells.append(cell)

        return enhanced_notebook

    def is_valid(self, words: "Iterable[spacy.tokens.Token]") -> bool:
        """Return True when *words* contains both a noun-like token and a verb.

        Args:
            words: Objects exposing a ``pos_`` attribute (spaCy tokens; a
                sentence span is passed by _postprocess_summary).
        """
        # Single pass so that one-shot iterables are handled too.
        tags = {word.pos_ for word in words}
        return bool(tags & self._NOUN_TAGS) and "VERB" in tags

    def _postprocess_summary(self, summary: str) -> str:
        """Keep only the well-formed sentences of a generated summary.

        The first sentence is ignored, and each remaining sentence is kept only
        if it passes is_valid(). If that leaves nothing (for example a
        one-sentence summary), the stripped original summary is returned so the
        summary cell and the title derived from it are not built from empty
        text.
        """
        doc = self.nlp(summary)
        # ignore the first sentence
        candidates = list(doc.sents)[1:]
        kept = [sent.text for sent in candidates if self.is_valid(sent)]
        if not kept:
            return summary.strip()
        return " ".join(kept)


@functools.lru_cache(maxsize=1)
def _get_enhancer():
    """Return a shared NotebookEnhancer so the models are loaded only once."""
    return NotebookEnhancer()


def process_notebook(file_path, output_path=DEFAULT_OUTPUT_PATH):
    """Process an uploaded notebook file.

    Args:
        file_path: Path of the notebook to read, or an object with a ``name``
            attribute holding that path (some Gradio versions pass an uploaded
            file wrapper instead of a string).
        output_path: Where to write the enhanced notebook.

    Returns:
        The path the enhanced notebook was written to.

    Raises:
        ValueError: If no file was provided.
    """
    if file_path is None:
        raise ValueError("No notebook file was provided.")
    source_path = getattr(file_path, "name", file_path)

    with open(source_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    enhanced_notebook = _get_enhancer().enhance_notebook(nb)
    logger.debug("Enhanced notebook has %d cells", len(enhanced_notebook.cells))

    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)

    return output_path


def build_gradio_interface():
    """Create the Gradio interface and return it; the caller launches it."""
    with gr.Blocks(title="Notebook Enhancer") as demo:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(
            """
            Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.

            This tool uses Hugging Face models to:
            1. Generate concise titles for code cells
            2. Create explanatory summaries of what the code does
            """
        )

        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="Upload Jupyter Notebook (.ipynb)")
                process_btn = gr.Button("Enhance Notebook")

            with gr.Column():
                output = gr.File(label="Enhanced Notebook")

        process_btn.click(fn=process_notebook, inputs=file_input, outputs=output)

    return demo


# This will be the entry point when running the script
if __name__ == "__main__":
    # Show this module's debug trace without making third-party loggers chatty.
    logging.basicConfig(level=logging.INFO)
    logger.setLevel(logging.DEBUG)

    # An optional first argument overrides the default input notebook.
    file_input = sys.argv[1] if len(sys.argv) > 1 else "my_notebook.json"
    test = process_notebook(file_input)
    # To serve the web UI instead: build_gradio_interface().launch()