archivartaunik committed on
Commit
0b17793
·
verified ·
1 Parent(s): be86d04

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -40
app.py CHANGED
@@ -1,4 +1,4 @@
1
- """ SRT Generator (Belarusian Edition)
2
 
3
  """
4
  from __future__ import annotations
@@ -7,6 +7,7 @@ import json
7
  import logging
8
  import mimetypes
9
  import os
 
10
  import threading
11
  import uuid
12
  from datetime import datetime
@@ -62,6 +63,13 @@ ALLOWED_VIDEO_PREFIX = ("video/",)
62
  HISTORY = Path("transcripts"); HISTORY.mkdir(exist_ok=True)
63
  TEXT_KEYS = ("text", "text_raw") # accepted keys for transcript text
64
 
 
 
 
 
 
 
 
65
  # ---------------------------------------------------------------------------
66
  # HELPERS
67
  # ---------------------------------------------------------------------------
@@ -74,50 +82,30 @@ def _validate(path: str, mime_prefixes: tuple[str, ...]) -> None:
74
  raise ValueError("Файл занадта вялікі.")
75
  mime, _ = mimetypes.guess_type(path)
76
  if not mime or not mime.startswith(mime_prefixes):
77
- raise ValueError(f"Непадтрыманы тып файла: {mime or 'невядомы'}.")
78
 
79
 
80
  def _parse_raw_time(raw: float | int | str) -> float:
81
- """Convert various time formats → seconds (float)."""
82
  if isinstance(raw, (int, float)):
83
  return float(raw)
84
 
85
  s = str(raw).strip()
86
- if "," in s and ":" in s:
87
- s = s.replace(",", ":", 1) # HH:MM:SS,mmm → HH:MM:SS:mmm
88
- else:
89
- s = s.replace(",", ".", 1) # decimal comma
90
-
91
- parts = s.split(":")
92
 
93
- # Just seconds
94
- if len(parts) == 1:
95
- try:
96
- return float(parts[0])
97
- except ValueError:
98
- return 0.0
99
 
100
- try:
101
- nums = [float(p) for p in parts]
102
- except ValueError:
103
- return 0.0
104
 
105
- if len(nums) == 4: # HH:MM:SS:MS
106
- h, m, sec, ms = nums
107
- elif len(nums) == 3:
108
- a, b, c = nums
109
- if c > 59: # treat as MS
110
- h = 0
111
- m, sec, ms = a, b, c
112
- else: # HH:MM:SS
113
- h, m, sec, ms = a, b, c, 0
114
- elif len(nums) == 2: # SS:MS
115
- h = m = 0
116
- sec, ms = nums
117
- else:
118
- return 0.0
119
 
120
- return h * 3600 + m * 60 + sec + ms / 1000.0
121
 
122
 
123
  def _sec_to_ts(raw: float | int | str) -> str:
@@ -128,6 +116,39 @@ def _sec_to_ts(raw: float | int | str) -> str:
128
  ms_int = int(round((rem - s_int) * 1000))
129
  return f"{int(h):02d}:{int(m):02d}:{s_int:02d},{ms_int:03d}"
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  # ---------------------------------------------------------------------------
132
  # GEMINI TRANSCRIPTION
133
  # ---------------------------------------------------------------------------
@@ -147,6 +168,7 @@ def _transcribe(path: str, status: Callable[[str], None]) -> str:
147
  logger.info("Gemini raw response (first 5k): %s", text[:5000])
148
  return text
149
 
 
150
  # ---------------------------------------------------------------------------
151
  # PIPELINE
152
  # ---------------------------------------------------------------------------
@@ -185,12 +207,12 @@ def transcribe_audio(path: str, status: Callable[[str], None]):
185
  status("📥 Апрацоўка транскрыпцыі …")
186
 
187
  try:
188
- segments = json.loads(raw)
189
  except json.JSONDecodeError as exc:
190
  raise ValueError("Gemini response is not valid JSON – see logs.") from exc
191
 
192
- valid: list[dict] = []
193
- for idx, seg in enumerate(segments, 1):
194
  if not {"start", "end"}.issubset(seg):
195
  logger.warning("Segment #%s missing timing – skipped", idx)
196
  continue
@@ -198,12 +220,13 @@ def transcribe_audio(path: str, status: Callable[[str], None]):
198
  if not txt:
199
  logger.warning("Segment #%s empty text – skipped", idx)
200
  continue
201
- valid.append({"start": seg["start"], "end": seg["end"], "text": txt})
202
 
203
- if not valid:
204
  raise ValueError("Gemini returned no usable segments – cannot build SRT.")
205
 
206
- return valid
 
207
 
208
 
209
  def transcripts_to_srt(segments: List[dict]) -> Tuple[str, str]:
@@ -218,6 +241,7 @@ def transcripts_to_srt(segments: List[dict]) -> Tuple[str, str]:
218
  out_path.write_text(content, "utf-8")
219
  return content, str(out_path)
220
 
 
221
  # ---------------------------------------------------------------------------
222
  # AUDIO / VIDEO HELPERS
223
  # ---------------------------------------------------------------------------
@@ -250,6 +274,7 @@ def handle_file(audio: str | None, video: str | None, status: Callable[[str], No
250
  return process_video(video, status)
251
  raise ValueError("Ні адзін файл не загружаны.")
252
 
 
253
  # ---------------------------------------------------------------------------
254
  # GRADIO UI
255
  # ---------------------------------------------------------------------------
 
1
+ """ SRT Generator (Belarusian Edition) – fixed version
2
 
3
  """
4
  from __future__ import annotations
 
7
  import logging
8
  import mimetypes
9
  import os
10
+ import re
11
  import threading
12
  import uuid
13
  from datetime import datetime
 
63
  HISTORY = Path("transcripts"); HISTORY.mkdir(exist_ok=True)
64
  TEXT_KEYS = ("text", "text_raw") # accepted keys for transcript text
65
 
66
+ # ---------------------------------------------------------------------------
67
+ # REGEXES FOR TIME PARSING
68
+ # ---------------------------------------------------------------------------
69
+ _RE_HMS_MS = re.compile(r"^(?:(\d{1,2}):)?(\d{1,2}):(\d{1,2})[.,](\d{1,3})$") # HH:MM:SS,ms
70
+ _RE_MS_MS = re.compile(r"^(\d{1,2}):(\d{1,2})[.,](\d{1,3})$") # MM:SS,ms
71
+ _RE_SECONDS = re.compile(r"^\d+(?:[.,]\d+)?$") # SS[.ms]
72
+
73
  # ---------------------------------------------------------------------------
74
  # HELPERS
75
  # ---------------------------------------------------------------------------
 
82
  raise ValueError("Файл занадта вялікі.")
83
  mime, _ = mimetypes.guess_type(path)
84
  if not mime or not mime.startswith(mime_prefixes):
85
+ raise ValueError(f"Непадтрыманы тып файла: {mime or 'невядомы' }.")
86
 
87
 
88
  def _parse_raw_time(raw: float | int | str) -> float:
89
+ """Convert supported time formats → seconds (float)."""
90
  if isinstance(raw, (int, float)):
91
  return float(raw)
92
 
93
  s = str(raw).strip()
94
+ if not s:
95
+ return 0.0
 
 
 
 
96
 
97
+ if (m := _RE_HMS_MS.match(s)):
98
+ h, m_, sec, ms = (int(x or 0) for x in m.groups())
99
+ return h * 3600 + m_ * 60 + sec + ms / 1_000
 
 
 
100
 
101
+ if (m := _RE_MS_MS.match(s)):
102
+ m_, sec, ms = (int(x) for x in m.groups())
103
+ return m_ * 60 + sec + ms / 1_000
 
104
 
105
+ if _RE_SECONDS.match(s):
106
+ return float(s.replace(",", "."))
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
+ raise ValueError(f"Невядомы фармат часу: {raw!r}")
109
 
110
 
111
  def _sec_to_ts(raw: float | int | str) -> str:
 
116
  ms_int = int(round((rem - s_int) * 1000))
117
  return f"{int(h):02d}:{int(m):02d}:{s_int:02d},{ms_int:03d}"
118
 
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # SANITISATION
122
+ # ---------------------------------------------------------------------------
123
+
124
def _sanitize_segments(raw_segments: list[dict]) -> list[dict]:
    """Return segments with consistent, non-overlapping timings.

    Each input segment must provide ``"start"``, ``"end"`` and ``"text"``
    keys. Timings are converted to seconds with ``_parse_raw_time`` and then
    repaired in input order:

    * if ``end`` precedes ``start`` the two values are swapped;
    * if ``start`` falls before the previous segment's end, it is moved to
      1 ms after that end;
    * if the segment still has no positive duration (``end <= start``), it
      is given a duration of one second.

    The input list and its dicts are not modified; new dicts with float
    ``start``/``end`` values are returned.

    Raises:
        KeyError: if a segment lacks one of the required keys.
        ValueError: propagated from ``_parse_raw_time`` for an unparseable
            time value.
    """
    fixed: list[dict] = []
    prev_end = 0.0

    for idx, seg in enumerate(raw_segments, 1):
        start = _parse_raw_time(seg["start"])
        end = _parse_raw_time(seg["end"])
        text = seg["text"]

        # Reversed timings: assume the two values were simply transposed.
        if end < start:
            logger.warning("Segment %s: end < start – swapping", idx)
            start, end = end, start

        # Overlap with the previous segment: start just after it ends.
        if start < prev_end:
            logger.warning("Segment %s: overlap – shifting", idx)
            start = prev_end + 0.001

        # Checked for every segment, not only shifted ones, so that a
        # zero-length segment (start == end) is repaired as well.
        if end <= start:
            logger.warning("Segment %s: non-positive duration – extending", idx)
            end = start + 1.0

        fixed.append({"start": start, "end": end, "text": text})
        prev_end = end

    return fixed
150
+
151
+
152
  # ---------------------------------------------------------------------------
153
  # GEMINI TRANSCRIPTION
154
  # ---------------------------------------------------------------------------
 
168
  logger.info("Gemini raw response (first 5k): %s", text[:5000])
169
  return text
170
 
171
+
172
  # ---------------------------------------------------------------------------
173
  # PIPELINE
174
  # ---------------------------------------------------------------------------
 
207
  status("📥 Апрацоўка транскрыпцыі …")
208
 
209
  try:
210
+ segments_json = json.loads(raw)
211
  except json.JSONDecodeError as exc:
212
  raise ValueError("Gemini response is not valid JSON – see logs.") from exc
213
 
214
+ raw_segments: list[dict] = []
215
+ for idx, seg in enumerate(segments_json, 1):
216
  if not {"start", "end"}.issubset(seg):
217
  logger.warning("Segment #%s missing timing – skipped", idx)
218
  continue
 
220
  if not txt:
221
  logger.warning("Segment #%s empty text – skipped", idx)
222
  continue
223
+ raw_segments.append({"start": seg["start"], "end": seg["end"], "text": txt})
224
 
225
+ if not raw_segments:
226
  raise ValueError("Gemini returned no usable segments – cannot build SRT.")
227
 
228
+ # --- NEW: sanitise timings ---
229
+ return _sanitize_segments(raw_segments)
230
 
231
 
232
  def transcripts_to_srt(segments: List[dict]) -> Tuple[str, str]:
 
241
  out_path.write_text(content, "utf-8")
242
  return content, str(out_path)
243
 
244
+
245
  # ---------------------------------------------------------------------------
246
  # AUDIO / VIDEO HELPERS
247
  # ---------------------------------------------------------------------------
 
274
  return process_video(video, status)
275
  raise ValueError("Ні адзін файл не загружаны.")
276
 
277
+
278
  # ---------------------------------------------------------------------------
279
  # GRADIO UI
280
  # ---------------------------------------------------------------------------