Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	| # Mostly from: https://github.com/adefossez/seewav | |
| # Original author: adefossez | |
| import math | |
| import tempfile | |
| from pathlib import Path | |
| import subprocess | |
| import cairo | |
| import numpy as np | |
| import gradio as gr | |
| from pydub import AudioSegment | |
| import re | |
| import colorsys | |
def read_audio(audio, seek=None, duration=None):
    """
    Read the `audio` file, starting at `seek` (or 0) seconds for `duration` (or all) seconds.

    Parameters:
        audio: path to any file format pydub/ffmpeg can decode.
        seek: optional start offset in seconds.
        duration: optional extract length in seconds, applied after seeking.

    Returns `(wav, samplerate)` where `wav` is `float32[channels, samples]`.
    """
    audio_segment = AudioSegment.from_file(audio)
    channels = audio_segment.channels
    samplerate = audio_segment.frame_rate
    if seek is not None:
        # pydub slices in milliseconds.
        seek_ms = int(seek * 1000)
        audio_segment = audio_segment[seek_ms:]
    if duration is not None:
        duration_ms = int(duration * 1000)
        audio_segment = audio_segment[:duration_ms]
    samples = audio_segment.get_array_of_samples()
    wav = np.array(samples, dtype=np.float32)
    # pydub returns samples interleaved (L R L R ...), so de-interleave by
    # reshaping to (samples, channels) and transposing. The previous
    # `wav.reshape(channels, -1)` instead split the stream into contiguous
    # halves, scrambling multi-channel audio.
    return wav.reshape(-1, channels).T, samplerate
def sigmoid(x):
    """Logistic function: maps any real scalar or ndarray into the open interval (0, 1)."""
    z = np.exp(-x)
    return 1.0 / (1.0 + z)
def envelope(wav, window, stride):
    """
    Extract the envelope of the waveform `wav` (float[samples]) by average-pooling
    the positive part of the signal over `window` samples with the given `stride`.
    """
    # Pad half a window on each side so the first/last frames are centered.
    padded = np.pad(wav, window // 2)
    # Rectify once up front; slicing the rectified signal per frame is
    # equivalent to rectifying each frame individually.
    rectified = np.maximum(padded, 0)
    pooled = np.array([
        rectified[start:start + window].mean()
        for start in range(0, len(padded) - window, stride)
    ])
    # Sigmoid-based soft compressor so quiet passages stay visible.
    return 1.9 * (sigmoid(2.5 * pooled) - 0.5)
def draw_env(envs, out, fg_colors, bg_color, size):
    """
    Render a single frame to `out` as a PNG using cairo.

    `envs` holds one envelope per stacked wave (each a float[bars] of bar
    heights); every entry becomes one vertical bar. `fg_colors` supplies one
    rgb triple per wave, `bg_color` fills the background, and `size` is the
    (width, height) of the frame in pixels.
    """
    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, *size)
    cr = cairo.Context(surface)
    # Work in normalized [0, 1] x [0, 1] coordinates from here on.
    cr.scale(*size)

    # Background fill.
    cr.set_source_rgb(*bg_color)
    cr.rectangle(0, 0, 1, 1)
    cr.fill()

    n_waves = len(envs)     # waves are stacked vertically
    n_steps = len(envs[0])  # time steps, one bar each
    pad_ratio = 0.1         # spacing ratio between 2 bars
    bar_width = 1.0 / (n_steps * (1 + 2 * pad_ratio))
    pad = pad_ratio * bar_width
    delta = 2 * pad + bar_width

    cr.set_line_width(bar_width)
    for step in range(n_steps):
        for wave in range(n_waves):
            # Semi-height of the bar, shrunk by the number of stacked waves.
            half = 0.5 * envs[wave][step] / n_waves
            # Vertical center line of the `wave`-th stacked wave.
            midrule = (1 + 2 * wave) / (2 * n_waves)
            x = pad + step * delta
            # Upper half of the bar: fully opaque.
            cr.set_source_rgb(*fg_colors[wave])
            cr.move_to(x, midrule - half)
            cr.line_to(x, midrule)
            cr.stroke()
            # Lower half: slightly transparent and 10% shorter (reflection look).
            cr.set_source_rgba(*fg_colors[wave], 0.8)
            cr.move_to(x, midrule)
            cr.line_to(x, midrule + 0.9 * half)
            cr.stroke()
    surface.write_to_png(out)
def interpole(x1, y1, x2, y2, x):
    """Linearly interpolate the segment (x1, y1)-(x2, y2), evaluated at `x` (extrapolates outside)."""
    slope = (y2 - y1) / (x2 - x1)
    return y1 + slope * (x - x1)
def visualize(
    progress,
    audio,
    tmp,
    out,
    seek=None,
    duration=None,
    rate=60,
    bars=50,
    speed=4,
    time=0.4,
    oversample=3,
    fg_color=(0.2, 0.2, 0.2),
    fg_color2=(0.5, 0.3, 0.6),
    bg_color=(1, 1, 1),
    size=(400, 400),
    stereo=False,
):
    """
    Generate the visualisation for the `audio` file, using a `tmp` folder and saving the final
    video in `out`.
    `progress` is a tqdm-like callable wrapping an iterable (here, `gr.Progress().tqdm`).
    `seek` and `durations` gives the extract location if any.
    `rate` is the framerate of the output video.
    `bars` is the number of bars in the animation.
    `speed` is the base speed of transition. Depending on volume, actual speed will vary
    between 0.5 and 2 times it.
    `time` amount of audio shown at once on a frame.
    `oversample` higher values will lead to more frequent changes.
    `fg_color` is the rgb color to use for the foreground.
    `fg_color2` is the rgb color to use for the second wav if stereo is set.
    `bg_color` is the rgb color to use for the background.
    `size` is the `(width, height)` in pixels to generate.
    `stereo` is whether to create 2 waves.
    """
    try:
        wav, sr = read_audio(audio, seek=seek, duration=duration)
    except (IOError, ValueError) as err:
        # Surface decode failures in the Gradio UI rather than crashing the worker.
        raise gr.Error(err)
    # wavs is a list of wav over channels
    wavs = []
    if stereo:
        assert wav.shape[0] == 2, "stereo requires stereo audio file"
        wavs.append(wav[0])
        wavs.append(wav[1])
    else:
        # Mono mode: collapse all channels to their mean.
        wav = wav.mean(0)
        wavs.append(wav)
    # Normalize each channel to unit variance so quiet recordings still animate.
    for i, wav in enumerate(wavs):
        wavs[i] = wav / wav.std()
    # `window`/`stride` are chosen so one frame of `time` seconds maps to `bars` envelope steps.
    window = int(sr * time / bars)
    stride = int(window / oversample)
    # envs is a list of env over channels
    envs = []
    for wav in wavs:
        env = envelope(wav, window, stride)
        # Pad with silence: half a frame before, two frames after, so the
        # env1/env2 lookahead slices below never run past the end.
        env = np.pad(env, (bars // 2, 2 * bars))
        envs.append(env)
    # NOTE: rebinds the `duration` parameter to the actual extract length in seconds.
    duration = len(wavs[0]) / sr
    frames = int(rate * duration)
    # Hanning window tapers bar heights toward both edges of the frame.
    smooth = np.hanning(bars)
    gr.Info("Generating the frames...")
    for idx in progress(range(frames)):
        # Position of this video frame, in units of whole `bars`-wide envelope frames.
        pos = (((idx / rate)) * sr) / stride / bars
        off = int(pos)
        loc = pos - off  # fractional progress between envelope frame `off` and `off + 1`
        denvs = []
        for env in envs:
            env1 = env[off * bars : (off + 1) * bars]
            env2 = env[(off + 1) * bars : (off + 2) * bars]
            # we want loud parts to be updated faster
            maxvol = math.log10(1e-4 + env2.max()) * 10
            # Map [-6 dB-ish .. 0] volume onto a [0.5, 2] speed multiplier.
            speedup = np.clip(interpole(-6, 0.5, 0, 2, maxvol), 0.5, 2)
            # Sigmoid crossfade between the current and the next envelope frame.
            w = sigmoid(speed * speedup * (loc - 0.5))
            denv = (1 - w) * env1 + w * env2
            denv *= smooth
            denvs.append(denv)
        draw_env(denvs, tmp / f"{idx:06d}.png", (fg_color, fg_color2), bg_color, size)
    gr.Info("Encoding the animation video...")
    # Stitch the numbered PNGs into an H.264 mp4 and mux the original audio back in.
    subprocess.run([
        "ffmpeg", "-y", "-loglevel", "panic", "-r",
        str(rate), "-f", "image2", "-s", f"{size[0]}x{size[1]}", "-i", "%06d.png", "-i", audio, "-c:a", "aac", "-vcodec", "libx264", "-crf", "10", "-pix_fmt", "yuv420p",
        out.resolve()
    ], check=True, cwd=tmp)
    return out
def parse_color(colorstr):
    """
    Parse a comma separated "r,g,b" string of floats into a 3-tuple.

    Note: the docstring previously claimed a 4-tuple, but this function has
    always returned exactly three floats.

    Raises `gr.Error` when the string is not exactly three comma-separated floats.
    """
    try:
        r, g, b = [float(part) for part in colorstr.split(",")]
        return r, g, b
    except ValueError as err:
        # Wrong float syntax and wrong component count both raise ValueError.
        raise gr.Error(
            "Format for color is 3 floats separated by commas 0.xx,0.xx,0.xx, rgb order"
        ) from err
def hex_to_rgb(color):
    """
    Convert a color string to an `(r, g, b)` tuple of floats in [0, 1].

    Supported formats:
        - HEX with or without a leading "#": "#FFA07A", "FFA07A", shorthand
          "#FA5"/"FA5", and the 8/4-digit variants carrying alpha
          ("#FFA07A80", "FA58").
        - "rgb(r, g, b)" with integer components in [0, 255].
        - "rgba(r, g, b, a)".
        - "hsv(h, s, v)" with h in [0, 360] and s, v in [0, 100].

    Alpha components are accepted but discarded: the return value is always a
    3-tuple, because callers pass it to cairo's `set_source_rgb` (and the
    docstring used to promise an RGBA 4-tuple that was never returned).

    Raises `ValueError` for any other format.
    """
    # HEX, with or without "#". The bare-hex path only triggers for strings
    # of plausible length made entirely of hex digits.
    if color.startswith("#") or (
        len(color) in (3, 4, 6, 8)
        and all(c in "0123456789ABCDEFabcdef" for c in color)
    ):
        color = color.lstrip("#")
        # Expand shorthand: "FA5" -> "FFAA55", "FA58" -> "FFAA5588".
        if len(color) in (3, 4):
            color = "".join(c * 2 for c in color)
        if len(color) in (6, 8):
            # For 8-digit input, the trailing alpha byte is ignored.
            return (
                int(color[0:2], 16) / 255.0,
                int(color[2:4], 16) / 255.0,
                int(color[4:6], 16) / 255.0,
            )
    # RGB format (rgb(r, g, b))
    match_rgb = re.match(r"rgb\((\d+),\s*(\d+),\s*(\d+)\)", color)
    if match_rgb:
        r, g, b = map(int, match_rgb.groups())
        return (r / 255.0, g / 255.0, b / 255.0)
    # RGBA format (rgba(r, g, b, a)) -- alpha parsed then discarded.
    match_rgba = re.match(r"rgba\((\d+),\s*(\d+),\s*(\d+),\s*([\d.]+)\)", color)
    if match_rgba:
        r, g, b, _a = match_rgba.groups()
        return (int(r) / 255.0, int(g) / 255.0, int(b) / 255.0)
    # HSV format (hsv(h, s, v))
    match_hsv = re.match(r"hsv\((\d+),\s*(\d+),\s*(\d+)\)", color)
    if match_hsv:
        h, s, v = map(int, match_hsv.groups())
        r, g, b = colorsys.hsv_to_rgb(h / 360.0, s / 100.0, v / 100.0)
        return (r, g, b)
    raise ValueError(f"Invalid color format: {color}")
def do_viz(
    inp_aud,
    inp_bgcolor,
    inp_color1,
    inp_nbars,
    inp_vidw,
    inp_vidh,
    progress=gr.Progress(),
):
    """
    Gradio click handler: render `inp_aud` into an mp4 and return its path.

    Frames are written into a throwaway temporary directory; the video itself
    is a named temporary file created with delete=False so it survives the
    `with` block and Gradio can serve it.
    """
    frame_dir = tempfile.TemporaryDirectory()
    video_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    with frame_dir as tmp, video_file as out:
        result = visualize(
            progress.tqdm,
            inp_aud,
            Path(tmp),
            Path(out.name),
            bars=inp_nbars,
            fg_color=hex_to_rgb(inp_color1),
            bg_color=hex_to_rgb(inp_bgcolor),
            size=(inp_vidw, inp_vidh),
        )
    return result
# NOTE: `gradio` is already imported as `gr` at the top of the file (and is
# already used above this point), so the duplicate `import gradio as gr` that
# used to sit here was redundant and has been removed.

# Markdown shown at the top of the app.
ABOUT = """
# seewav GUI
> Have an audio clip but need a video (e.g. for X/Twitter)?
**Convert audio into a video!**
An online graphical user interface for [seewav](https://github.com/adefossez/seewav).
"""

with gr.Blocks() as demo:
    gr.Markdown(ABOUT)
    with gr.Row():
        # Left column: all inputs.
        with gr.Column():
            inp_aud = gr.Audio(type='filepath')
            with gr.Group():
                inp_color1 = gr.ColorPicker(
                    label="Color",
                    info="Color of the top waveform",
                    value="#00237E",
                    interactive=True,
                )
                inp_bgcolor = gr.ColorPicker(
                    label="Background Color",
                    info="Color of the background",
                    value="#000000",
                    interactive=True,
                )
            with gr.Accordion("Advanced Configuration", open=False):
                inp_nbars = gr.Slider(
                    label="Num. Bars",
                    value=50,
                    interactive=True,
                    minimum=5,
                    maximum=1500,
                )
                inp_vidw = gr.Slider(
                    label="Video Width",
                    value=400,
                    interactive=True,
                    minimum=100,
                    maximum=3000,
                )
                inp_vidh = gr.Slider(
                    label="Video Height",
                    value=400,
                    interactive=True,
                    minimum=100,
                    maximum=3000,
                )
            inp_go = gr.Button("Visualize", variant="primary")
        # Right column: the rendered video.
        with gr.Column():
            out_vid = gr.Video(interactive=False)
    inp_go.click(
        do_viz,
        inputs=[
            inp_aud,
            inp_bgcolor,
            inp_color1,
            inp_nbars,
            inp_vidw,
            inp_vidh,
        ],
        outputs=[out_vid],
    )

demo.queue(api_open=False, default_concurrency_limit=20).launch(show_api=False)
