TheFrenchDemos committed
Commit f747801 · Parent: 97bee26

implemented core generation + detection

Files changed (37)
  1. .gitattributes +2 -0
  2. Dockerfile +2 -2
  3. run.py +2 -2
  4. wm_interactive/core/detector.py +2 -2
  5. wm_interactive/core/generator.py +6 -2
  6. wm_interactive/core/hashing.py +1 -1
  7. wm_interactive/core/main.py +11 -14
  8. wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/0ad5ecc2035b7031b88afb544ee95e2d49baa484.lock +0 -0
  9. wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/36293b6099200eb8aeb55ae2c01bca2ba46d80d0.lock +0 -0
  10. wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/44719d2e365acac0637fd25a3acf46494ca45940.lock +0 -0
  11. wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/5af571cbf074e6d21a03528d2330792e532ca608f24ac70a143f6b369968ab8c.lock +0 -0
  12. wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/69503b13f727ba3812b6803e97442a6de05ef5eb.lock +0 -0
  13. wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/8c7b22013909450429303ed10be4398bd63f5457.lock +0 -0
  14. wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/da6c4d71a43aa7e6f785bdbb28ea5025438a73fa.lock +0 -0
  15. wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/f922b1797f0c88e71addc8393787831f2477a4bd.lock +0 -0
  16. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/.no_exist/e2c3f7557efbdec707ae3a336371d169783f1da1/added_tokens.json +0 -0
  17. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/0ad5ecc2035b7031b88afb544ee95e2d49baa484 +3 -0
  18. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/36293b6099200eb8aeb55ae2c01bca2ba46d80d0 +3 -0
  19. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/44719d2e365acac0637fd25a3acf46494ca45940 +3 -0
  20. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/5af571cbf074e6d21a03528d2330792e532ca608f24ac70a143f6b369968ab8c +3 -0
  21. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/69503b13f727ba3812b6803e97442a6de05ef5eb +3 -0
  22. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/8c7b22013909450429303ed10be4398bd63f5457 +3 -0
  23. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/da6c4d71a43aa7e6f785bdbb28ea5025438a73fa +3 -0
  24. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/f922b1797f0c88e71addc8393787831f2477a4bd +3 -0
  25. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/refs/main +3 -0
  26. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/config.json +1 -0
  27. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/generation_config.json +1 -0
  28. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/merges.txt +1 -0
  29. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/model.safetensors +1 -0
  30. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/special_tokens_map.json +1 -0
  31. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/tokenizer.json +1 -0
  32. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/tokenizer_config.json +1 -0
  33. wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/vocab.json +1 -0
  34. wm_interactive/static/styles.css +14 -0
  35. wm_interactive/templates/index.html +132 -2
  36. wm_interactive/web/app.py +110 -12
  37. wm_interactive/web/utils.py +19 -1
.gitattributes CHANGED
@@ -36,3 +36,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
static/ia_gen_droits_auteur.pdf filter=lfs diff=lfs merge=lfs -text
+ wm_interactive/static/hf_cache/** filter=lfs diff=lfs merge=lfs -text
+ wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/5af571cbf074e6d21a03528d2330792e532ca608f24ac70a143f6b369968ab8c filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -7,11 +7,11 @@ COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
- COPY wm_detector/ ./wm_detector/
+ COPY wm_interactive/ ./wm_interactive/
COPY run.py .

# Create necessary directories
- RUN mkdir -p wm_detector/static/hf_cache
+ RUN mkdir -p wm_interactive/static/hf_cache

# Set environment variables
ENV PYTHONPATH=/app
run.py CHANGED
@@ -2,8 +2,8 @@
Main entry point for the watermark detection application.
Run with: python run.py

- docker build -t wm-detector .
- docker run -p 7860:7860 wm-detector
+ docker build -t wm-interactive .
+ docker run -p 7860:7860 wm-interactive
"""

from wm_interactive.web.app import app
wm_interactive/core/detector.py CHANGED
@@ -159,7 +159,7 @@ class MarylandDetector(WmDetector):
tokenizer: AutoTokenizer,
ngram: int = 1,
seed: int = 0,
- gamma: float = 0.25,
+ gamma: float = 0.5,
delta: float = 1.0,
**kwargs):
super().__init__(tokenizer, ngram, seed, **kwargs)
@@ -194,7 +194,7 @@ class MarylandDetectorZ(WmDetector):
tokenizer: AutoTokenizer,
ngram: int = 1,
seed: int = 0,
- gamma: float = 0.25,
+ gamma: float = 0.5,
delta: float = 1.0,
**kwargs):
super().__init__(tokenizer, ngram, seed, **kwargs)
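Note on the default change above: in the Maryland (greenlist) watermarking scheme, gamma is the fraction of the vocabulary marked "green" at each step, so moving the default from 0.25 to 0.5 changes the null-hypothesis baseline the detectors test against. A minimal sketch of the usual greenlist z-statistic, for orientation only (not code from detector.py):

import math

def greenlist_zscore(num_green: int, num_scored: int, gamma: float = 0.5) -> float:
    # Under H0 each scored token is green with probability gamma, so for S green
    # tokens out of T scored: z = (S - gamma*T) / sqrt(gamma*(1-gamma)*T).
    expected = gamma * num_scored
    std = math.sqrt(gamma * (1 - gamma) * num_scored)
    return (num_green - expected) / std

print(greenlist_zscore(num_green=70, num_scored=100, gamma=0.5))  # 4.0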
wm_interactive/core/generator.py CHANGED
@@ -59,13 +59,17 @@ class WmGenerator():
next_tok = self.sample_next(outputs.logits[:, -1, :], aux, temperature, top_p)
tokens[0, cur_pos] = torch.where(input_text_mask[0, cur_pos], tokens[0, cur_pos], next_tok)
prev_pos = cur_pos
+ if next_tok == self.eos_id:
+ break

# cut to max gen len
t = tokens[0, :prompt_size + max_gen_len].tolist()
# cut to eos tok if any
finish_reason = 'length'
try:
- t = t[: t.index(self.eos_id)]
+ find_eos = t[prompt_size:].index(self.eos_id)
+ if find_eos:
+ t = t[: prompt_size+find_eos]
finish_reason = 'eos'
except ValueError:
pass
@@ -158,7 +162,7 @@ class MarylandGenerator(WmGenerator):
"""
def __init__(self,
*args,
- gamma: float = 0.25,
+ gamma: float = 0.5,
delta: float = 1.0,
test_mul: float = 0,
**kwargs
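For reference, the EOS handling added in the first hunk can be exercised in isolation; the helper name below is ours, only the trimming logic comes from the diff. EOS is now searched only in the generated part, so an EOS token inside the prompt no longer truncates the output, and a missing EOS leaves finish_reason at 'length'.

def trim_at_eos(t: list, prompt_size: int, eos_id: int):
    # Same logic as the new generator code: look for EOS after the prompt,
    # cut there if found; ValueError means no EOS was generated.
    finish_reason = 'length'
    try:
        find_eos = t[prompt_size:].index(eos_id)
        if find_eos:
            t = t[: prompt_size + find_eos]
        finish_reason = 'eos'
    except ValueError:
        pass
    return t, finish_reason

print(trim_at_eos([5, 6, 7, 11, 12, 2, 13], prompt_size=3, eos_id=2))
# ([5, 6, 7, 11, 12], 'eos')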
wm_interactive/core/hashing.py CHANGED
@@ -10,4 +10,4 @@ def get_seed_rng(
"""
for ii in input_ids:
start = (start * salt + ii) % (2 ** 64 - 1)
- return start
+ return int(start)
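A hedged sketch of why the cast matters; the signature and the salt default below are illustrative, only the loop body and the return statement come from the diff:

import torch

def get_seed_rng(input_ids, start: int = 0, salt: int = 101) -> int:
    for ii in input_ids:
        start = (start * salt + ii) % (2 ** 64 - 1)
    # int() guarantees a plain Python integer regardless of the element type
    # of input_ids (e.g. torch/numpy scalars), so downstream seeding works.
    return int(start)

seed = get_seed_rng([42, 7, 3])
torch.Generator().manual_seed(seed)  # manual_seed expects a Python int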
wm_interactive/core/main.py CHANGED
@@ -28,12 +28,12 @@ model_names = {
CACHE_DIR = "wm_interactive/static/hf_cache"


- def load_prompts(json_path: str, prompt_type: str = "alpaca", nsamples: int = None) -> list[dict]:
+ def load_prompts(json_path: str, prompt_type: str = "smollm", nsamples: int = None) -> list[dict]:
"""Load prompts from a JSON file.

Args:
json_path: Path to the JSON file
- prompt_type: Type of prompt dataset (alpaca)
+ prompt_type: Type of prompt dataset (alpaca, smollm)
nsamples: Number of samples to load (if None, load all)

Returns:
@@ -46,10 +46,13 @@ def load_prompts(json_path: str, prompt_type: str = "alpaca", nsamples: int = No
data = json.load(f)

if prompt_type == "alpaca":
- prompts = [{"instruction": f"{item["instruction"]}"} for item in data]
- # prompt = "<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n"
- # prompt += f"<|im_start|>user\n{item['instruction']}<|im_end|>\n"
- # prompts.append({"instruction": prompt})
+ prompts = [{"instruction": item["instruction"]} for item in data]
+ elif prompt_type == "smollm":
+ prompts = []
+ for item in data:
+ prompt = "<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n"
+ prompt += f"<|im_start|>user\n{item['instruction']}<|im_end|>\n<|im_start|>assistant\n"
+ prompts.append({"instruction": prompt})
else:
raise ValueError(f"Prompt type {prompt_type} not supported")

@@ -93,7 +96,7 @@ def get_args_parser():
# prompts parameters
parser.add_argument('--prompt_path', type=str, default=None,
help='Path to the prompt dataset. Required if --prompt is not provided')
- parser.add_argument('--prompt_type', type=str, default="alpaca",
+ parser.add_argument('--prompt_type', type=str, default="smollm",
help='Type of prompt dataset. Only used if --prompt_path is provided')
parser.add_argument('--prompt', type=str, nargs='+', default=None,
help='List of prompts to use. If not provided, prompts will be loaded from --prompt_path')
@@ -148,17 +151,11 @@ def main(args):
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
device = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"Using device: {device}")

model = AutoModelForCausalLM.from_pretrained(
model_name,
- device_map=device,
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
cache_dir=CACHE_DIR
- )
- model = model.eval()
- for param in model.parameters():
- param.requires_grad = False
+ ).to(device)

# build watermark generator
if args.method == "none":
wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/0ad5ecc2035b7031b88afb544ee95e2d49baa484.lock ADDED
File without changes
wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/36293b6099200eb8aeb55ae2c01bca2ba46d80d0.lock ADDED
File without changes
wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/44719d2e365acac0637fd25a3acf46494ca45940.lock ADDED
File without changes
wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/5af571cbf074e6d21a03528d2330792e532ca608f24ac70a143f6b369968ab8c.lock ADDED
File without changes
wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/69503b13f727ba3812b6803e97442a6de05ef5eb.lock ADDED
File without changes
wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/8c7b22013909450429303ed10be4398bd63f5457.lock ADDED
File without changes
wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/da6c4d71a43aa7e6f785bdbb28ea5025438a73fa.lock ADDED
File without changes
wm_interactive/static/hf_cache/.locks/models--HuggingFaceTB--SmolLM2-135M-Instruct/f922b1797f0c88e71addc8393787831f2477a4bd.lock ADDED
File without changes
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/.no_exist/e2c3f7557efbdec707ae3a336371d169783f1da1/added_tokens.json ADDED
File without changes
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/0ad5ecc2035b7031b88afb544ee95e2d49baa484 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82b84012e3add4d01d12ba14442026e49b8cbbaead1f79ecf3d919784f82dc79
+ size 800662
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/36293b6099200eb8aeb55ae2c01bca2ba46d80d0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8eb740e8bbe4cff95ea7b4588d17a2432deb16e8075bc5828ff7ba9be94d982a
+ size 861
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/44719d2e365acac0637fd25a3acf46494ca45940 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b7379f3ae813529281a5c602bc5a11c1d4e0a99107aaa597fe936c1e813ca52
+ size 655
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/5af571cbf074e6d21a03528d2330792e532ca608f24ac70a143f6b369968ab8c ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5af571cbf074e6d21a03528d2330792e532ca608f24ac70a143f6b369968ab8c
+ size 269060552
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/69503b13f727ba3812b6803e97442a6de05ef5eb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0b54e8aa4e53d5383e2e4bc635a56b43f9647f7b13832d5d9ecd8f82dac4f510
+ size 466391
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/8c7b22013909450429303ed10be4398bd63f5457 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ec77d44f62efeb38d7e044a1db318f6a939438425312dfa333b8382dbad98df
+ size 3764
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/da6c4d71a43aa7e6f785bdbb28ea5025438a73fa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87b916edaaab66b3899b9d0dd0752727dff6666686da0504d89ae0a6e055a013
+ size 132
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/f922b1797f0c88e71addc8393787831f2477a4bd ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ca9acddb6525a194ec8ac7a87f24fbba7232a9a15ffa1af0c1224fcd888e47c
+ size 2104556
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/refs/main ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71a184f20b0fe5c1a9407ed75fa9633b681779c7f1a5ca478f22fdff69a6c7ab
+ size 40
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/config.json ADDED
@@ -0,0 +1 @@
+ ../../blobs/36293b6099200eb8aeb55ae2c01bca2ba46d80d0
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/generation_config.json ADDED
@@ -0,0 +1 @@
+ ../../blobs/da6c4d71a43aa7e6f785bdbb28ea5025438a73fa
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/merges.txt ADDED
@@ -0,0 +1 @@
+ ../../blobs/69503b13f727ba3812b6803e97442a6de05ef5eb
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/model.safetensors ADDED
@@ -0,0 +1 @@
+ ../../blobs/5af571cbf074e6d21a03528d2330792e532ca608f24ac70a143f6b369968ab8c
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ ../../blobs/44719d2e365acac0637fd25a3acf46494ca45940
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/tokenizer.json ADDED
@@ -0,0 +1 @@
+ ../../blobs/f922b1797f0c88e71addc8393787831f2477a4bd
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ ../../blobs/8c7b22013909450429303ed10be4398bd63f5457
wm_interactive/static/hf_cache/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/e2c3f7557efbdec707ae3a336371d169783f1da1/vocab.json ADDED
@@ -0,0 +1 @@
+ ../../blobs/0ad5ecc2035b7031b88afb544ee95e2d49baa484
wm_interactive/static/styles.css CHANGED
@@ -29,9 +29,23 @@ h1 {
resize: none;
font-size: 14px;
line-height: 1.5;
+ margin-bottom: 10px;
+ }
+
+ .input-section #prompt_text {
+ height: 100px;
+ }
+
+ .input-section #user_text {
height: 200px;
}

+ .button-container {
+ display: flex;
+ gap: 10px;
+ margin-bottom: 10px;
+ }
+
.token-display {
margin: 20px 0;
padding: 10px;
wm_interactive/templates/index.html CHANGED
@@ -56,8 +56,14 @@

<!-- Input Form -->
<div class="input-section">
+ <textarea id="prompt_text"
+ placeholder="Enter your prompt here to generate text with the model..."></textarea>
+ <div class="button-container">
+ <button class="btn btn-primary" id="generateBtn">Generate</button>
+ <button class="btn btn-secondary" id="stopBtn" disabled>Stop</button>
+ </div>
<textarea id="user_text"
- placeholder="Replace this text in the input field to see how watermark detection works."></textarea>
+ placeholder="Generated text will appear here. Replace or edit this text to see how watermark detection works."></textarea>
</div>

<!-- Token Display -->
@@ -87,7 +93,11 @@
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"></script>
<script>
let debounceTimeout = null;
+ let eventSource = null;
const textarea = document.getElementById('user_text');
+ const promptArea = document.getElementById('prompt_text');
+ const generateBtn = document.getElementById('generateBtn');
+ const stopBtn = document.getElementById('stopBtn');
const tokenDisplay = document.getElementById('tokenDisplay');
const tokenCount = document.getElementById('tokenCount');
const scoredTokens = document.getElementById('scoredTokens');
@@ -98,6 +108,122 @@
const ngramInput = document.getElementById('ngram');
const detectorTypeSelect = document.getElementById('detectorType');

+ function startGeneration() {
+ const prompt = promptArea.value.trim();
+ if (!prompt) {
+ alert('Please enter a prompt first.');
+ return;
+ }
+
+ generateBtn.disabled = true;
+ stopBtn.disabled = false;
+ textarea.value = '';
+
+ // Get current parameters
+ const params = {
+ detector_type: detectorTypeSelect.value,
+ seed: parseInt(seedInput.value) || 0,
+ ngram: parseInt(ngramInput.value) || 1
+ };
+
+ // Create headers for SSE
+ const headers = new Headers({
+ 'Content-Type': 'application/json',
+ 'Accept': 'text/event-stream',
+ });
+
+ // Start fetch request
+ fetch('/generate', {
+ method: 'POST',
+ headers: headers,
+ body: JSON.stringify({
+ prompt: prompt,
+ params: params
+ })
+ }).then(response => {
+ const reader = response.body.getReader();
+ const decoder = new TextDecoder();
+ let buffer = '';
+
+ function processText(text) {
+ const lines = text.split('\n');
+
+ for (const line of lines) {
+ if (line.startsWith('data: ')) {
+ try {
+ const data = JSON.parse(line.slice(6));
+
+ if (data.error) {
+ alert('Error: ' + data.error);
+ stopGeneration();
+ return;
+ }
+
+ if (data.token) {
+ // Append new token to existing text
+ textarea.value += data.token;
+ updateTokenization();
+ }
+
+ if (data.text) {
+ // Final text (only used if something went wrong with streaming)
+ textarea.value = data.text;
+ updateTokenization();
+ }
+
+ if (data.done) {
+ stopGeneration();
+ }
+ } catch (e) {
+ console.error('Error parsing SSE data:', e);
+ }
+ }
+ }
+ }
+
+ function pump() {
+ return reader.read().then(({value, done}) => {
+ if (done) {
+ if (buffer.length > 0) {
+ processText(buffer);
+ }
+ return;
+ }
+
+ buffer += decoder.decode(value, {stream: true});
+ const lines = buffer.split('\n\n');
+ buffer = lines.pop();
+
+ for (const line of lines) {
+ processText(line);
+ }
+
+ return pump();
+ });
+ }
+
+ return pump();
+ })
+ .catch(error => {
+ console.error('Error:', error);
+ alert('Error: Failed to generate text');
+ })
+ .finally(() => {
+ generateBtn.disabled = false;
+ stopBtn.disabled = true;
+ });
+ }
+
+ function stopGeneration() {
+ generateBtn.disabled = false;
+ stopBtn.disabled = true;
+ }
+
+ // Add event listeners for generation buttons
+ generateBtn.addEventListener('click', startGeneration);
+ stopBtn.addEventListener('click', stopGeneration);
+
+ // Rest of the existing JavaScript code...
async function updateTokenization() {
const text = textarea.value;
try {
@@ -210,7 +336,11 @@
document.addEventListener('keydown', function(e) {
if ((e.metaKey || e.ctrlKey) && e.key === 'Enter') {
e.preventDefault();
- applyParamsBtn.click();
+ if (document.activeElement === promptArea) {
+ generateBtn.click();
+ } else {
+ applyParamsBtn.click();
+ }
}
});

wm_interactive/web/app.py CHANGED
@@ -2,11 +2,14 @@
Main Flask application for the watermark detection web interface.
"""

- from flask import Flask, render_template, request, jsonify
+ from flask import Flask, render_template, request, jsonify, Response, stream_with_context
from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+ import json

from ..core.detector import MarylandDetector, MarylandDetectorZ, OpenaiDetector, OpenaiDetectorZ
- from .utils import get_token_details
+ from ..core.generator import WmGenerator, OpenaiGenerator, MarylandGenerator
+ from .utils import get_token_details, template_prompt

CACHE_DIR = "wm_interactive/static/hf_cache"

@@ -21,6 +24,12 @@ def convert_nan_to_null(obj):
return [convert_nan_to_null(item) for item in obj]
return obj

+ def set_to_int(value, default_value = None):
+ try:
+ return int(value)
+ except (ValueError, TypeError):
+ return default_value
+
def create_detector(detector_type, tokenizer, **kwargs):
"""Create a detector instance based on the specified type."""
detector_map = {
@@ -32,16 +41,9 @@

# Validate and set default values for parameters
if 'seed' in kwargs:
- try:
- kwargs['seed'] = int(kwargs['seed'])
- except (ValueError, TypeError):
- kwargs['seed'] = 0
-
+ kwargs['seed'] = set_to_int(kwargs['seed'], default_value = 0)
if 'ngram' in kwargs:
- try:
- kwargs['ngram'] = int(kwargs['ngram'])
- except (ValueError, TypeError):
- kwargs['ngram'] = 1
+ kwargs['ngram'] = set_to_int(kwargs['ngram'], default_value = 1)

detector_class = detector_map.get(detector_type, MarylandDetector)
return detector_class(tokenizer=tokenizer, **kwargs)
@@ -58,7 +60,10 @@
# model_id = "meta-llama/Llama-3.2-1B-Instruct"
model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)
- model = AutoModelForCausalLM.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=CACHE_DIR).to("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Create default generator
+ generator = MarylandGenerator(model, tokenizer, ngram=1, seed=0)

@app.route("/", methods=["GET"])
def index():
@@ -132,6 +137,99 @@
app.logger.error(f'Server error: {str(e)}')
return jsonify({'error': f'Server error: {str(e)}'}), 500

+ @app.route("/generate", methods=["POST"])
+ def generate():
+ try:
+ data = request.get_json()
+ if not data:
+ return jsonify({'error': 'No JSON data received'}), 400
+
+ prompt = template_prompt(data.get('prompt', ''))
+ params = data.get('params', {})
+
+ def generate_stream():
+ try:
+ # Create generator with correct parameters
+ generator_class = OpenaiGenerator if params.get('detector_type') == 'openai' else MarylandGenerator
+ generator = generator_class(
+ model=model,
+ tokenizer=tokenizer,
+ ngram=set_to_int(params.get('ngram', 1)),
+ seed=set_to_int(params.get('seed', 0))
+ )
+
+ # Get special tokens to filter out
+ special_tokens = {
+ '<|im_start|>', '<|im_end|>',
+ tokenizer.pad_token, tokenizer.eos_token,
+ tokenizer.bos_token if hasattr(tokenizer, 'bos_token') else None,
+ tokenizer.sep_token if hasattr(tokenizer, 'sep_token') else None
+ }
+ special_tokens = {t for t in special_tokens if t is not None}
+
+ # Encode prompt
+ prompt_tokens = tokenizer.encode(prompt)
+ prompt_size = len(prompt_tokens)
+ max_gen_len = 100
+ total_len = min(getattr(model.config, 'max_position_embeddings', 2048), max_gen_len + prompt_size)
+
+ # Initialize generation
+ tokens = torch.full((1, total_len), model.config.pad_token_id).to(model.device).long()
+ tokens[0, :prompt_size] = torch.tensor(prompt_tokens).long()
+ input_text_mask = tokens != model.config.pad_token_id
+
+ # Generate token by token
+ prev_pos = 0
+ outputs = None # Initialize outputs to None
+ for cur_pos in range(prompt_size, total_len):
+ # Get model outputs
+ outputs = model.forward(
+ tokens[:, prev_pos:cur_pos],
+ use_cache=True,
+ past_key_values=outputs.past_key_values if prev_pos > 0 else None
+ )
+
+ # Sample next token using the generator's sampling method
+ aux = {
+ 'ngram_tokens': tokens[:, cur_pos-generator.ngram:cur_pos],
+ 'cur_pos': cur_pos,
+ }
+ next_token = generator.sample_next(
+ outputs.logits[:, -1, :],
+ aux,
+ temperature=0.8,
+ top_p=0.95
+ )
+ # Check for EOS token
+ if next_token == model.config.eos_token_id:
+ break
+
+ # Decode and check if it's a special token
+ new_text = tokenizer.decode([next_token])
+ if new_text not in special_tokens and not any(st in new_text for st in special_tokens):
+ yield f"data: {json.dumps({'token': new_text, 'done': False})}\n\n"
+
+ # Update token and position
+ tokens[0, cur_pos] = next_token
+ prev_pos = cur_pos
+
+ # Send final complete text, filtering out special tokens
+ final_tokens = tokens[0, prompt_size:cur_pos+1].tolist()
+ final_text = tokenizer.decode(final_tokens)
+ for st in special_tokens:
+ final_text = final_text.replace(st, '')
+ yield f"data: {json.dumps({'text': final_text, 'done': True})}\n\n"
+
+ except Exception as e:
+ app.logger.error(f'Error generating text: {str(e)}')
+ yield f"data: {json.dumps({'error': str(e)})}\n\n"
+
+ return Response(stream_with_context(generate_stream()), mimetype='text/event-stream')
+
+ except Exception as e:
+ app.logger.error(f'Server error: {str(e)}')
+ return jsonify({'error': f'Server error: {str(e)}'}), 500
+
return app

app = create_app()
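The new /generate route streams server-sent events, one "data: {...}" JSON object per token. A hedged sketch of consuming it outside the browser (the URL and port follow the docker run line in run.py; the detector_type value is arbitrary here, since anything other than 'openai' falls back to MarylandGenerator):

import json
import requests

resp = requests.post(
    "http://localhost:7860/generate",
    json={"prompt": "Write two sentences about rivers.",
          "params": {"detector_type": "maryland", "seed": 0, "ngram": 1}},
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    if line and line.startswith("data: "):
        event = json.loads(line[len("data: "):])
        if event.get("error"):
            raise RuntimeError(event["error"])
        if event.get("token"):
            print(event["token"], end="", flush=True)  # stream tokens as they arrive
        if event.get("done"):
            break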
wm_interactive/web/utils.py CHANGED
@@ -1,4 +1,3 @@
-
import random
import numpy as np

@@ -63,3 +62,22 @@ def get_token_details(
})

return display_info
+
+ def template_prompt(instruction: str, prompt_type: str = "smollm") -> str:
+ """Template a prompt according to the model's format.
+
+ Args:
+ instruction: The raw prompt/instruction to template
+ prompt_type: Type of prompt format (smollm, alpaca)
+
+ Returns:
+ The formatted prompt ready for the model
+ """
+ if prompt_type == "alpaca":
+ return instruction
+ elif prompt_type == "smollm":
+ prompt = "<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n"
+ prompt += f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
+ return prompt
+ else:
+ raise ValueError(f"Prompt type {prompt_type} not supported")
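A quick usage sketch of the new helper (the import path is inferred from the file layout above):

from wm_interactive.web.utils import template_prompt

print(template_prompt("What is 2 + 2?"))                        # wrapped in SmolLM2 chat markup (default)
print(template_prompt("What is 2 + 2?", prompt_type="alpaca"))  # returned unchanged
template_prompt("What is 2 + 2?", prompt_type="other")          # raises ValueError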