import spaces
import os, copy, gc, re, sys
import traceback
import torch
import torch.nn.functional as F
from datetime import datetime
import gradio as gr
from huggingface_hub import hf_hub_download

# Force CPU mode as requested
use_cuda = False
device = torch.device("cpu")
print(f"Using device: {device} (forced CPU mode)")

# Set RWKV environment variables for CPU
os.environ["RWKV_V7_ON"] = '1'
os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '0'

# Model parameters
ctx_limit = 4000
gen_limit = 32000
title_v6 = "rwkv7-g1-0.1b-20250307-ctx4096"

# Load RWKV with fallback mechanisms
try:
    # First try importing normally
    from rwkv.model import RWKV
    from rwkv.utils import PIPELINE, PIPELINE_ARGS
    print("RWKV imported successfully")
except Exception as e:
    print(f"Error importing RWKV: {e}")
    print("Attempting fallback import method...")
    # Fallback method - reinstall the package
    try:
        import subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--force-reinstall", "rwkv"])
        from rwkv.model import RWKV
        from rwkv.utils import PIPELINE, PIPELINE_ARGS
        print("RWKV imported after reinstall")
    except Exception as e:
        print(f"Failed to import RWKV after reinstall: {e}")
        raise

# Download and initialize the model
try:
    print(f"Downloading model {title_v6}...")
    model_path_v6 = hf_hub_download(repo_id="BlinkDL/rwkv7-g1", filename=f"{title_v6}.pth")
    print(f"Model downloaded to {model_path_v6}")

    # Use CPU strategy
    strategy = 'cpu fp32'
    print(f"Using strategy: {strategy}")

    # Initialize model with CPU strategy
    model_v6 = RWKV(model=model_path_v6.replace('.pth', ''), strategy=strategy)
    pipeline_v6 = PIPELINE(model_v6, "rwkv_vocab_v20230424")
    args = model_v6.args
    print("Model initialized successfully")
    model_loaded = True
except Exception as e:
    print(f"Error loading model: {e}")
    traceback.print_exc()
    model_loaded = False

# Text generation parameters
penalty_decay = 0.996

@spaces.GPU
def generate_prompt(instruction, input=""):
    instruction = instruction.strip().replace('\r\n', '\n').replace('\n\n', '\n')
    input = input.strip().replace('\r\n', '\n').replace('\n\n', '\n')
    if input:
        return f"""Instruction: {instruction}\n\nInput: {input}\n\nResponse:"""
    else:
        return f"""User: {instruction}\n\nAssistant:"""

def qa_prompt(instruction):
    instruction = instruction.strip().replace('\r\n', '\n')
    instruction = re.sub(r'\n+', '\n', instruction)
    return f"User: {instruction}\n\nAssistant:"

def evaluate(
    ctx,
    token_count=200,
    temperature=1.0,
    top_p=0.7,
    presencePenalty=0.1,
    countPenalty=0.1,
):
    if not model_loaded:
        yield "Error: Model failed to load. Please check logs for details."
        return

    try:
        args = PIPELINE_ARGS(temperature=max(0.2, float(temperature)),
                             top_p=float(top_p),
                             alpha_frequency=countPenalty,
                             alpha_presence=presencePenalty,
                             token_ban=[],    # ban the generation of some tokens
                             token_stop=[0])  # stop generation whenever you see any token here

        ctx = ctx.strip()
        all_tokens = []
        out_last = 0
        out_str = ''
        occurrence = {}
        state = None

        for i in range(int(token_count)):
            # Feed the full (truncated) prompt on the first step, then only the last sampled token
            input_ids = pipeline_v6.encode(ctx)[-ctx_limit:] if i == 0 else [token]
            out, state = model_v6.forward(input_ids, state)

            # Apply presence/frequency penalties to tokens already generated
            for n in occurrence:
                out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)

            token = pipeline_v6.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
            if token in args.token_stop:
                break
            all_tokens += [token]

            # Decay the repetition penalty over time
            for xxx in occurrence:
                occurrence[xxx] *= penalty_decay

            ttt = pipeline_v6.decode([token])
            www = 1
            if ttt in ' \t0123456789':
                www = 0
            if token not in occurrence:
                occurrence[token] = www
            else:
                occurrence[token] += www

            tmp = pipeline_v6.decode(all_tokens[out_last:])
            if '\ufffd' not in tmp:
                out_str += tmp
                yield out_str.strip()
                out_last = i + 1

        # Clean up to free memory
        del out
        del state
        gc.collect()
        yield out_str.strip()
    except Exception as e:
        print(f"Error during generation: {e}")
        traceback.print_exc()
        yield f"Error during generation: {str(e)}"

# Example prompts
examples = [
    ["User: simulate SpaceX mars landing using python\n\nAssistant:"],
]
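# Status strings shown in the page header below. These two variables are referenced by the
# header HTML but are not defined elsewhere in this script, so the values here are an
# assumed reconstruction derived from the model_loaded flag and the forced-CPU device above.
model_status = "Model loaded successfully" if model_loaded else "Model failed to load"
device_status = f"Running on {device}"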

# Build the Gradio UI
with gr.Blocks(title=title_v6) as demo:
    gr.HTML(f"<div style=\"text-align: center;\">\n<h1>{title_v6}</h1>\n<p>{model_status} - {device_status}</p>\n</div>\n")
    with gr.Tab("=== Base Model (Raw Generation) ==="):
        gr.Markdown(f'This is [RWKV7 G1](https://huggingface.co/BlinkDL/rwkv7-g1) 0.1B (!!!) L12-D768 reasoning base LM - an attention-free pure RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM). Supports 100+ world languages and code. Check [400+ Github RWKV projects](https://github.com/search?o=desc&p=1&q=rwkv&s=updated&type=Repositories). *** Can try examples (bottom of page) *** (can edit them). Demo limited to ctxlen {ctx_limit}.')
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(lines=6, label="Prompt", value="User: simulate SpaceX mars landing using python\n\nAssistant:")