mingyang91 committed on
Commit
b5d8a79
·
verified ·
1 Parent(s): 386611a

feat: Implement alternative solution achieving functional goals

Browse files

This commit represents a mixed outcome. While the implementation successfully achieves the intended functionality, it diverges from the original plan. The solution, though unconventional, meets the necessary requirements and ensures operational effectiveness. Future revisions may align it more closely with the initial strategy.

Files changed (7) hide show
  1. Cargo.lock +245 -18
  2. Cargo.toml +7 -1
  3. config/dev.yaml +17 -9
  4. ggml-metal.metal +0 -0
  5. src/config.rs +70 -32
  6. src/main.rs +3 -3
  7. src/whisper.rs +208 -106
Cargo.lock CHANGED
@@ -37,6 +37,15 @@ dependencies = [
37
  "memchr",
38
  ]
39
 
 
 
 
 
 
 
 
 
 
40
  [[package]]
41
  name = "anyhow"
42
  version = "1.0.75"
@@ -62,7 +71,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
62
  dependencies = [
63
  "proc-macro2",
64
  "quote",
65
- "syn",
66
  ]
67
 
68
  [[package]]
@@ -73,7 +82,18 @@ checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9"
73
  dependencies = [
74
  "proc-macro2",
75
  "quote",
76
- "syn",
 
 
 
 
 
 
 
 
 
 
 
77
  ]
78
 
79
  [[package]]
@@ -511,6 +531,29 @@ dependencies = [
511
  "vsimd",
512
  ]
513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
  [[package]]
515
  name = "bindgen"
516
  version = "0.68.1"
@@ -518,7 +561,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
518
  checksum = "726e4313eb6ec35d2730258ad4e15b547ee75d6afaa1361a922e78e59b7d8078"
519
  dependencies = [
520
  "bitflags 2.4.1",
521
- "cexpr",
522
  "clang-sys",
523
  "lazy_static",
524
  "lazycell",
@@ -529,9 +572,9 @@ dependencies = [
529
  "quote",
530
  "regex",
531
  "rustc-hash",
532
- "shlex",
533
- "syn",
534
- "which",
535
  ]
536
 
537
  [[package]]
@@ -586,13 +629,22 @@ dependencies = [
586
  "libc",
587
  ]
588
 
 
 
 
 
 
 
 
 
 
589
  [[package]]
590
  name = "cexpr"
591
  version = "0.6.0"
592
  source = "registry+https://github.com/rust-lang/crates.io-index"
593
  checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
594
  dependencies = [
595
- "nom",
596
  ]
597
 
598
  [[package]]
@@ -612,6 +664,21 @@ dependencies = [
612
  "libloading",
613
  ]
614
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
  [[package]]
616
  name = "cmake"
617
  version = "0.1.50"
@@ -630,7 +697,7 @@ dependencies = [
630
  "async-trait",
631
  "json5",
632
  "lazy_static",
633
- "nom",
634
  "pathdiff",
635
  "ron",
636
  "rust-ini",
@@ -722,6 +789,19 @@ version = "1.9.0"
722
  source = "registry+https://github.com/rust-lang/crates.io-index"
723
  checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
724
 
 
 
 
 
 
 
 
 
 
 
 
 
 
725
  [[package]]
726
  name = "equivalent"
727
  version = "1.0.1"
@@ -788,7 +868,7 @@ checksum = "53b153fd91e4b0147f4aced87be237c98248656bb01050b96bf3ee89220a8ddb"
788
  dependencies = [
789
  "proc-macro2",
790
  "quote",
791
- "syn",
792
  ]
793
 
794
  [[package]]
@@ -818,6 +898,15 @@ dependencies = [
818
  "slab",
819
  ]
820
 
 
 
 
 
 
 
 
 
 
821
  [[package]]
822
  name = "generic-array"
823
  version = "0.14.7"
@@ -909,6 +998,15 @@ dependencies = [
909
  "http",
910
  ]
911
 
 
 
 
 
 
 
 
 
 
912
  [[package]]
913
  name = "hermit-abi"
914
  version = "0.3.3"
@@ -939,6 +1037,12 @@ dependencies = [
939
  "windows-sys",
940
  ]
941
 
 
 
 
 
 
 
942
  [[package]]
943
  name = "http"
944
  version = "0.2.9"
@@ -973,6 +1077,12 @@ version = "1.0.3"
973
  source = "registry+https://github.com/rust-lang/crates.io-index"
974
  checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
975
 
 
 
 
 
 
 
976
  [[package]]
977
  name = "hyper"
978
  version = "0.14.27"
@@ -1078,6 +1188,16 @@ version = "0.2.150"
1078
  source = "registry+https://github.com/rust-lang/crates.io-index"
1079
  checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
1080
 
 
 
 
 
 
 
 
 
 
 
1081
  [[package]]
1082
  name = "libloading"
1083
  version = "0.7.4"
@@ -1173,6 +1293,16 @@ dependencies = [
1173
  "windows-sys",
1174
  ]
1175
 
 
 
 
 
 
 
 
 
 
 
1176
  [[package]]
1177
  name = "nom"
1178
  version = "7.1.3"
@@ -1218,7 +1348,7 @@ version = "1.16.0"
1218
  source = "registry+https://github.com/rust-lang/crates.io-index"
1219
  checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
1220
  dependencies = [
1221
- "hermit-abi",
1222
  "libc",
1223
  ]
1224
 
@@ -1337,7 +1467,7 @@ dependencies = [
1337
  "pest_meta",
1338
  "proc-macro2",
1339
  "quote",
1340
- "syn",
1341
  ]
1342
 
1343
  [[package]]
@@ -1405,7 +1535,7 @@ dependencies = [
1405
  "proc-macro-crate",
1406
  "proc-macro2",
1407
  "quote",
1408
- "syn",
1409
  ]
1410
 
1411
  [[package]]
@@ -1420,6 +1550,8 @@ dependencies = [
1420
  "aws-sdk-translate",
1421
  "config",
1422
  "futures-util",
 
 
1423
  "once_cell",
1424
  "poem",
1425
  "serde",
@@ -1429,6 +1561,7 @@ dependencies = [
1429
  "tokio-stream",
1430
  "tracing",
1431
  "tracing-subscriber",
 
1432
  "whisper-rs",
1433
  "whisper-rs-sys",
1434
  ]
@@ -1452,7 +1585,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1452
  checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d"
1453
  dependencies = [
1454
  "proc-macro2",
1455
- "syn",
1456
  ]
1457
 
1458
  [[package]]
@@ -1764,7 +1897,7 @@ checksum = "d6c7207fbec9faa48073f3e3074cbe553af6ea512d7c21ba46e434e70ea9fbc1"
1764
  dependencies = [
1765
  "proc-macro2",
1766
  "quote",
1767
- "syn",
1768
  ]
1769
 
1770
  [[package]]
@@ -1834,6 +1967,12 @@ dependencies = [
1834
  "lazy_static",
1835
  ]
1836
 
 
 
 
 
 
 
1837
  [[package]]
1838
  name = "shlex"
1839
  version = "1.2.0"
@@ -1890,12 +2029,29 @@ version = "0.9.8"
1890
  source = "registry+https://github.com/rust-lang/crates.io-index"
1891
  checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
1892
 
 
 
 
 
 
 
1893
  [[package]]
1894
  name = "subtle"
1895
  version = "2.5.0"
1896
  source = "registry+https://github.com/rust-lang/crates.io-index"
1897
  checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
1898
 
 
 
 
 
 
 
 
 
 
 
 
1899
  [[package]]
1900
  name = "syn"
1901
  version = "2.0.39"
@@ -1907,6 +2063,24 @@ dependencies = [
1907
  "unicode-ident",
1908
  ]
1909
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1910
  [[package]]
1911
  name = "thiserror"
1912
  version = "1.0.50"
@@ -1924,7 +2098,7 @@ checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8"
1924
  dependencies = [
1925
  "proc-macro2",
1926
  "quote",
1927
- "syn",
1928
  ]
1929
 
1930
  [[package]]
@@ -2006,7 +2180,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
2006
  dependencies = [
2007
  "proc-macro2",
2008
  "quote",
2009
- "syn",
2010
  ]
2011
 
2012
  [[package]]
@@ -2107,7 +2281,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
2107
  dependencies = [
2108
  "proc-macro2",
2109
  "quote",
2110
- "syn",
2111
  ]
2112
 
2113
  [[package]]
@@ -2149,6 +2323,29 @@ dependencies = [
2149
  "tracing-log",
2150
  ]
2151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2152
  [[package]]
2153
  name = "try-lock"
2154
  version = "0.2.4"
@@ -2225,6 +2422,12 @@ dependencies = [
2225
  "tinyvec",
2226
  ]
2227
 
 
 
 
 
 
 
2228
  [[package]]
2229
  name = "unsafe-libyaml"
2230
  version = "0.2.9"
@@ -2272,6 +2475,12 @@ version = "0.1.0"
2272
  source = "registry+https://github.com/rust-lang/crates.io-index"
2273
  checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
2274
 
 
 
 
 
 
 
2275
  [[package]]
2276
  name = "version_check"
2277
  version = "0.9.4"
@@ -2299,6 +2508,15 @@ version = "0.11.0+wasi-snapshot-preview1"
2299
  source = "registry+https://github.com/rust-lang/crates.io-index"
2300
  checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
2301
 
 
 
 
 
 
 
 
 
 
2302
  [[package]]
2303
  name = "which"
2304
  version = "4.4.2"
@@ -2324,7 +2542,7 @@ name = "whisper-rs-sys"
2324
  version = "0.7.3"
2325
  source = "git+https://github.com/mingyang91/whisper-rs.git#f8e424a19b13cc348395afd862f0dcb864fcb1fc"
2326
  dependencies = [
2327
- "bindgen",
2328
  "cfg-if",
2329
  "cmake",
2330
  "fs_extra",
@@ -2346,6 +2564,15 @@ version = "0.4.0"
2346
  source = "registry+https://github.com/rust-lang/crates.io-index"
2347
  checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
2348
 
 
 
 
 
 
 
 
 
 
2349
  [[package]]
2350
  name = "winapi-x86_64-pc-windows-gnu"
2351
  version = "0.4.0"
 
37
  "memchr",
38
  ]
39
 
40
+ [[package]]
41
+ name = "ansi_term"
42
+ version = "0.12.1"
43
+ source = "registry+https://github.com/rust-lang/crates.io-index"
44
+ checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
45
+ dependencies = [
46
+ "winapi",
47
+ ]
48
+
49
  [[package]]
50
  name = "anyhow"
51
  version = "1.0.75"
 
71
  dependencies = [
72
  "proc-macro2",
73
  "quote",
74
+ "syn 2.0.39",
75
  ]
76
 
77
  [[package]]
 
82
  dependencies = [
83
  "proc-macro2",
84
  "quote",
85
+ "syn 2.0.39",
86
+ ]
87
+
88
+ [[package]]
89
+ name = "atty"
90
+ version = "0.2.14"
91
+ source = "registry+https://github.com/rust-lang/crates.io-index"
92
+ checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
93
+ dependencies = [
94
+ "hermit-abi 0.1.19",
95
+ "libc",
96
+ "winapi",
97
  ]
98
 
99
  [[package]]
 
531
  "vsimd",
532
  ]
533
 
534
+ [[package]]
535
+ name = "bindgen"
536
+ version = "0.56.0"
537
+ source = "registry+https://github.com/rust-lang/crates.io-index"
538
+ checksum = "2da379dbebc0b76ef63ca68d8fc6e71c0f13e59432e0987e508c1820e6ab5239"
539
+ dependencies = [
540
+ "bitflags 1.3.2",
541
+ "cexpr 0.4.0",
542
+ "clang-sys",
543
+ "clap",
544
+ "env_logger",
545
+ "lazy_static",
546
+ "lazycell",
547
+ "log",
548
+ "peeking_take_while",
549
+ "proc-macro2",
550
+ "quote",
551
+ "regex",
552
+ "rustc-hash",
553
+ "shlex 0.1.1",
554
+ "which 3.1.1",
555
+ ]
556
+
557
  [[package]]
558
  name = "bindgen"
559
  version = "0.68.1"
 
561
  checksum = "726e4313eb6ec35d2730258ad4e15b547ee75d6afaa1361a922e78e59b7d8078"
562
  dependencies = [
563
  "bitflags 2.4.1",
564
+ "cexpr 0.6.0",
565
  "clang-sys",
566
  "lazy_static",
567
  "lazycell",
 
572
  "quote",
573
  "regex",
574
  "rustc-hash",
575
+ "shlex 1.2.0",
576
+ "syn 2.0.39",
577
+ "which 4.4.2",
578
  ]
579
 
580
  [[package]]
 
629
  "libc",
630
  ]
631
 
632
+ [[package]]
633
+ name = "cexpr"
634
+ version = "0.4.0"
635
+ source = "registry+https://github.com/rust-lang/crates.io-index"
636
+ checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27"
637
+ dependencies = [
638
+ "nom 5.1.3",
639
+ ]
640
+
641
  [[package]]
642
  name = "cexpr"
643
  version = "0.6.0"
644
  source = "registry+https://github.com/rust-lang/crates.io-index"
645
  checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
646
  dependencies = [
647
+ "nom 7.1.3",
648
  ]
649
 
650
  [[package]]
 
664
  "libloading",
665
  ]
666
 
667
+ [[package]]
668
+ name = "clap"
669
+ version = "2.34.0"
670
+ source = "registry+https://github.com/rust-lang/crates.io-index"
671
+ checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
672
+ dependencies = [
673
+ "ansi_term",
674
+ "atty",
675
+ "bitflags 1.3.2",
676
+ "strsim",
677
+ "textwrap",
678
+ "unicode-width",
679
+ "vec_map",
680
+ ]
681
+
682
  [[package]]
683
  name = "cmake"
684
  version = "0.1.50"
 
697
  "async-trait",
698
  "json5",
699
  "lazy_static",
700
+ "nom 7.1.3",
701
  "pathdiff",
702
  "ron",
703
  "rust-ini",
 
789
  source = "registry+https://github.com/rust-lang/crates.io-index"
790
  checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
791
 
792
+ [[package]]
793
+ name = "env_logger"
794
+ version = "0.8.4"
795
+ source = "registry+https://github.com/rust-lang/crates.io-index"
796
+ checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3"
797
+ dependencies = [
798
+ "atty",
799
+ "humantime",
800
+ "log",
801
+ "regex",
802
+ "termcolor",
803
+ ]
804
+
805
  [[package]]
806
  name = "equivalent"
807
  version = "1.0.1"
 
868
  dependencies = [
869
  "proc-macro2",
870
  "quote",
871
+ "syn 2.0.39",
872
  ]
873
 
874
  [[package]]
 
898
  "slab",
899
  ]
900
 
901
+ [[package]]
902
+ name = "fvad"
903
+ version = "0.1.3"
904
+ source = "registry+https://github.com/rust-lang/crates.io-index"
905
+ checksum = "8b8e04cf8731da968d9456575a0ae44cb8760dee46169a5289a0e87d4cc4743a"
906
+ dependencies = [
907
+ "libfvad-sys",
908
+ ]
909
+
910
  [[package]]
911
  name = "generic-array"
912
  version = "0.14.7"
 
998
  "http",
999
  ]
1000
 
1001
+ [[package]]
1002
+ name = "hermit-abi"
1003
+ version = "0.1.19"
1004
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1005
+ checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
1006
+ dependencies = [
1007
+ "libc",
1008
+ ]
1009
+
1010
  [[package]]
1011
  name = "hermit-abi"
1012
  version = "0.3.3"
 
1037
  "windows-sys",
1038
  ]
1039
 
1040
+ [[package]]
1041
+ name = "hound"
1042
+ version = "3.5.1"
1043
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1044
+ checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"
1045
+
1046
  [[package]]
1047
  name = "http"
1048
  version = "0.2.9"
 
1077
  source = "registry+https://github.com/rust-lang/crates.io-index"
1078
  checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
1079
 
1080
+ [[package]]
1081
+ name = "humantime"
1082
+ version = "2.1.0"
1083
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1084
+ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
1085
+
1086
  [[package]]
1087
  name = "hyper"
1088
  version = "0.14.27"
 
1188
  source = "registry+https://github.com/rust-lang/crates.io-index"
1189
  checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
1190
 
1191
+ [[package]]
1192
+ name = "libfvad-sys"
1193
+ version = "1.0.0"
1194
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1195
+ checksum = "473b5389760c65fab561600c78f609ee5779799ae8d29818eccea95d8a8c94d8"
1196
+ dependencies = [
1197
+ "bindgen 0.56.0",
1198
+ "cc",
1199
+ ]
1200
+
1201
  [[package]]
1202
  name = "libloading"
1203
  version = "0.7.4"
 
1293
  "windows-sys",
1294
  ]
1295
 
1296
+ [[package]]
1297
+ name = "nom"
1298
+ version = "5.1.3"
1299
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1300
+ checksum = "08959a387a676302eebf4ddbcbc611da04285579f76f88ee0506c63b1a61dd4b"
1301
+ dependencies = [
1302
+ "memchr",
1303
+ "version_check",
1304
+ ]
1305
+
1306
  [[package]]
1307
  name = "nom"
1308
  version = "7.1.3"
 
1348
  source = "registry+https://github.com/rust-lang/crates.io-index"
1349
  checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
1350
  dependencies = [
1351
+ "hermit-abi 0.3.3",
1352
  "libc",
1353
  ]
1354
 
 
1467
  "pest_meta",
1468
  "proc-macro2",
1469
  "quote",
1470
+ "syn 2.0.39",
1471
  ]
1472
 
1473
  [[package]]
 
1535
  "proc-macro-crate",
1536
  "proc-macro2",
1537
  "quote",
1538
+ "syn 2.0.39",
1539
  ]
1540
 
1541
  [[package]]
 
1550
  "aws-sdk-translate",
1551
  "config",
1552
  "futures-util",
1553
+ "fvad",
1554
+ "hound",
1555
  "once_cell",
1556
  "poem",
1557
  "serde",
 
1561
  "tokio-stream",
1562
  "tracing",
1563
  "tracing-subscriber",
1564
+ "tracing-test",
1565
  "whisper-rs",
1566
  "whisper-rs-sys",
1567
  ]
 
1585
  checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d"
1586
  dependencies = [
1587
  "proc-macro2",
1588
+ "syn 2.0.39",
1589
  ]
1590
 
1591
  [[package]]
 
1897
  dependencies = [
1898
  "proc-macro2",
1899
  "quote",
1900
+ "syn 2.0.39",
1901
  ]
1902
 
1903
  [[package]]
 
1967
  "lazy_static",
1968
  ]
1969
 
1970
+ [[package]]
1971
+ name = "shlex"
1972
+ version = "0.1.1"
1973
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1974
+ checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2"
1975
+
1976
  [[package]]
1977
  name = "shlex"
1978
  version = "1.2.0"
 
2029
  source = "registry+https://github.com/rust-lang/crates.io-index"
2030
  checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
2031
 
2032
+ [[package]]
2033
+ name = "strsim"
2034
+ version = "0.8.0"
2035
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2036
+ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
2037
+
2038
  [[package]]
2039
  name = "subtle"
2040
  version = "2.5.0"
2041
  source = "registry+https://github.com/rust-lang/crates.io-index"
2042
  checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
2043
 
2044
+ [[package]]
2045
+ name = "syn"
2046
+ version = "1.0.109"
2047
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2048
+ checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
2049
+ dependencies = [
2050
+ "proc-macro2",
2051
+ "quote",
2052
+ "unicode-ident",
2053
+ ]
2054
+
2055
  [[package]]
2056
  name = "syn"
2057
  version = "2.0.39"
 
2063
  "unicode-ident",
2064
  ]
2065
 
2066
+ [[package]]
2067
+ name = "termcolor"
2068
+ version = "1.4.0"
2069
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2070
+ checksum = "ff1bc3d3f05aff0403e8ac0d92ced918ec05b666a43f83297ccef5bea8a3d449"
2071
+ dependencies = [
2072
+ "winapi-util",
2073
+ ]
2074
+
2075
+ [[package]]
2076
+ name = "textwrap"
2077
+ version = "0.11.0"
2078
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2079
+ checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
2080
+ dependencies = [
2081
+ "unicode-width",
2082
+ ]
2083
+
2084
  [[package]]
2085
  name = "thiserror"
2086
  version = "1.0.50"
 
2098
  dependencies = [
2099
  "proc-macro2",
2100
  "quote",
2101
+ "syn 2.0.39",
2102
  ]
2103
 
2104
  [[package]]
 
2180
  dependencies = [
2181
  "proc-macro2",
2182
  "quote",
2183
+ "syn 2.0.39",
2184
  ]
2185
 
2186
  [[package]]
 
2281
  dependencies = [
2282
  "proc-macro2",
2283
  "quote",
2284
+ "syn 2.0.39",
2285
  ]
2286
 
2287
  [[package]]
 
2323
  "tracing-log",
2324
  ]
2325
 
2326
+ [[package]]
2327
+ name = "tracing-test"
2328
+ version = "0.2.4"
2329
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2330
+ checksum = "3a2c0ff408fe918a94c428a3f2ad04e4afd5c95bbc08fcf868eff750c15728a4"
2331
+ dependencies = [
2332
+ "lazy_static",
2333
+ "tracing-core",
2334
+ "tracing-subscriber",
2335
+ "tracing-test-macro",
2336
+ ]
2337
+
2338
+ [[package]]
2339
+ name = "tracing-test-macro"
2340
+ version = "0.2.4"
2341
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2342
+ checksum = "258bc1c4f8e2e73a977812ab339d503e6feeb92700f6d07a6de4d321522d5c08"
2343
+ dependencies = [
2344
+ "lazy_static",
2345
+ "quote",
2346
+ "syn 1.0.109",
2347
+ ]
2348
+
2349
  [[package]]
2350
  name = "try-lock"
2351
  version = "0.2.4"
 
2422
  "tinyvec",
2423
  ]
2424
 
2425
+ [[package]]
2426
+ name = "unicode-width"
2427
+ version = "0.1.11"
2428
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2429
+ checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85"
2430
+
2431
  [[package]]
2432
  name = "unsafe-libyaml"
2433
  version = "0.2.9"
 
2475
  source = "registry+https://github.com/rust-lang/crates.io-index"
2476
  checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
2477
 
2478
+ [[package]]
2479
+ name = "vec_map"
2480
+ version = "0.8.2"
2481
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2482
+ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
2483
+
2484
  [[package]]
2485
  name = "version_check"
2486
  version = "0.9.4"
 
2508
  source = "registry+https://github.com/rust-lang/crates.io-index"
2509
  checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
2510
 
2511
+ [[package]]
2512
+ name = "which"
2513
+ version = "3.1.1"
2514
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2515
+ checksum = "d011071ae14a2f6671d0b74080ae0cd8ebf3a6f8c9589a2cd45f23126fe29724"
2516
+ dependencies = [
2517
+ "libc",
2518
+ ]
2519
+
2520
  [[package]]
2521
  name = "which"
2522
  version = "4.4.2"
 
2542
  version = "0.7.3"
2543
  source = "git+https://github.com/mingyang91/whisper-rs.git#f8e424a19b13cc348395afd862f0dcb864fcb1fc"
2544
  dependencies = [
2545
+ "bindgen 0.68.1",
2546
  "cfg-if",
2547
  "cmake",
2548
  "fs_extra",
 
2564
  source = "registry+https://github.com/rust-lang/crates.io-index"
2565
  checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
2566
 
2567
+ [[package]]
2568
+ name = "winapi-util"
2569
+ version = "0.1.6"
2570
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2571
+ checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596"
2572
+ dependencies = [
2573
+ "winapi",
2574
+ ]
2575
+
2576
  [[package]]
2577
  name = "winapi-x86_64-pc-windows-gnu"
2578
  version = "0.4.0"
Cargo.toml CHANGED
@@ -18,8 +18,9 @@ serde_json = "1.0"
18
  serde_yaml = "0.9"
19
  tokio = { version = "1.33", features = ["macros", "rt-multi-thread", "sync", "signal"] }
20
  tokio-stream = "0.1"
21
- tracing = "0.1"
22
  tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 
23
 
24
  [dependencies.poem]
25
  version = "1.3"
@@ -27,6 +28,11 @@ features = ["websocket", "static-files"]
27
 
28
  [dependencies.whisper-rs]
29
  git = "https://github.com/mingyang91/whisper-rs.git"
 
30
  [dependencies.whisper-rs-sys]
31
  git = "https://github.com/mingyang91/whisper-rs.git"
32
  package = "whisper-rs-sys"
 
 
 
 
 
18
  serde_yaml = "0.9"
19
  tokio = { version = "1.33", features = ["macros", "rt-multi-thread", "sync", "signal"] }
20
  tokio-stream = "0.1"
21
+ tracing = { version = "0.1", features = [] }
22
  tracing-subscriber = { version = "0.3", features = ["env-filter"] }
23
+ fvad = "0.1"
24
 
25
  [dependencies.poem]
26
  version = "1.3"
 
28
 
29
  [dependencies.whisper-rs]
30
  git = "https://github.com/mingyang91/whisper-rs.git"
31
+ features = ["coreml", "metal"]
32
  [dependencies.whisper-rs-sys]
33
  git = "https://github.com/mingyang91/whisper-rs.git"
34
  package = "whisper-rs-sys"
35
+
36
+ [dev-dependencies]
37
+ hound = "3.5.1"
38
+ tracing-test = "*"
config/dev.yaml CHANGED
@@ -2,24 +2,32 @@ server:
2
  port: 8080
3
  host: 0.0.0.0
4
  whisper:
5
- length_ms: 10000
6
  keep_ms: 200
7
  step_ms: 5000
8
- model: "models/ggml-large-q5_0.bin" #"models/ggml-base.bin"
9
- max_prompt_tokens: 128
 
 
 
10
  params:
11
- #n_threads: 4
12
- max_tokens: 0
13
- audio_ctx: 0
14
  speed_up: false
15
- single_segment: false
16
  translate: false
17
- no_fallback: false
18
- temperature_inc: -1.0
 
 
 
19
  print_special: false
20
  print_progress: false
21
  print_realtime: false
 
22
  no_context: false
23
  no_timestamps: false
 
24
  tinydiarize: false
25
  language: "en"
 
2
  port: 8080
3
  host: 0.0.0.0
4
  whisper:
5
+ length_ms: 5000
6
  keep_ms: 200
7
  step_ms: 5000
8
+ model: "models/ggml-large-v3.bin"
9
+ # model: "models/ggml-base.bin"
10
+ # model: "models/ggml-medium.en.bin"
11
+ max_prompt_tokens: 32
12
+ context_confidence_threshold: 0.5
13
  params:
14
+ # n_threads: 8
15
+ # max_tokens: 0
16
+ # audio_ctx: 0
17
  speed_up: false
18
+ single_segment: true
19
  translate: false
20
+ # temperature_inc: 0.2 #0.4
21
+ # temperature_inc: 0
22
+ # entropy_threshold: 2.5
23
+ # entropy_threshold: 2.8
24
+ # n_max_text_ctx: 64 #16384
25
  print_special: false
26
  print_progress: false
27
  print_realtime: false
28
+ token_timestamps: false
29
  no_context: false
30
  no_timestamps: false
31
+ suppress_non_speech_tokens: false
32
  tinydiarize: false
33
  language: "en"
ggml-metal.metal ADDED
The diff for this file is too large to render. See raw diff
 
src/config.rs CHANGED
@@ -3,7 +3,7 @@ use std::{env, ffi::c_int, net::IpAddr};
3
  use config::{Config, Environment, File};
4
  use once_cell::sync::Lazy;
5
  use serde::Deserialize;
6
- use whisper_rs::FullParams;
7
  use tracing::debug;
8
 
9
  pub(crate) static SETTINGS: Lazy<Settings> =
@@ -12,28 +12,33 @@ pub(crate) static SETTINGS: Lazy<Settings> =
12
  #[derive(Debug, Deserialize, Clone)]
13
  pub(crate) struct WhisperConfig {
14
  pub(crate) params: WhisperParams,
15
- pub(crate) step_ms: u32,
16
- pub(crate) length_ms: u32,
17
- pub(crate) keep_ms: u32,
18
  pub(crate) model: String,
19
  pub(crate) max_prompt_tokens: usize,
 
20
  }
21
 
22
  #[allow(dead_code)]
23
  #[derive(Debug, Deserialize, Clone)]
24
  pub(crate) struct WhisperParams {
25
  pub(crate) n_threads: Option<usize>,
26
- pub(crate) max_tokens: u32,
27
- pub(crate) audio_ctx: u32,
28
- pub(crate) speed_up: bool,
29
- pub(crate) translate: bool,
30
- pub(crate) no_fallback: bool,
31
- pub(crate) print_special: bool,
32
- pub(crate) print_realtime: bool,
33
- pub(crate) print_progress: bool,
34
- pub(crate) no_timestamps: bool,
35
- pub(crate) temperature_inc: f32,
36
- pub(crate) single_segment: bool,
 
 
 
 
37
  // pub(crate) tinydiarize: bool,
38
  pub(crate) language: Option<String>,
39
  }
@@ -41,25 +46,58 @@ pub(crate) struct WhisperParams {
41
  impl WhisperParams {
42
  pub(crate) fn to_full_params<'a, 'b>(&'a self, tokens: &'b [c_int]) -> FullParams<'a, 'b> {
43
  let mut param = FullParams::new(Default::default());
44
- param.set_print_progress(self.print_progress);
45
- param.set_print_special(self.print_special);
46
- param.set_print_realtime(self.print_realtime);
47
- param.set_print_timestamps(!self.no_timestamps);
48
- param.set_translate(self.translate);
49
- param.set_single_segment(false);
50
- param.set_max_tokens(self.max_tokens as i32);
51
- let lang = self.language.as_deref();
52
- param.set_language(lang);
53
- let num_cpus = std::thread::available_parallelism()
54
- .map(|c| c.get())
55
- .unwrap_or(4);
56
- param.set_n_threads(self.n_threads.unwrap_or(num_cpus) as c_int);
57
- param.set_audio_ctx(self.audio_ctx as i32);
58
- param.set_speed_up(self.speed_up);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  // param.set_tdrz_enable(self.tinydiarize);
60
- param.set_temperature_inc(self.temperature_inc);
61
- param.set_tokens(tokens);
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
 
63
  param
64
  }
65
  }
 
3
  use config::{Config, Environment, File};
4
  use once_cell::sync::Lazy;
5
  use serde::Deserialize;
6
+ use whisper_rs::{FullParams};
7
  use tracing::debug;
8
 
9
  pub(crate) static SETTINGS: Lazy<Settings> =
 
12
  #[derive(Debug, Deserialize, Clone)]
13
  pub(crate) struct WhisperConfig {
14
  pub(crate) params: WhisperParams,
15
+ pub(crate) step_ms: usize,
16
+ pub(crate) length_ms: usize,
17
+ pub(crate) keep_ms: usize,
18
  pub(crate) model: String,
19
  pub(crate) max_prompt_tokens: usize,
20
+ pub(crate) context_confidence_threshold: f32,
21
  }
22
 
23
  #[allow(dead_code)]
24
  #[derive(Debug, Deserialize, Clone)]
25
  pub(crate) struct WhisperParams {
26
  pub(crate) n_threads: Option<usize>,
27
+ pub(crate) max_tokens: Option<u32>,
28
+ pub(crate) audio_ctx: Option<u32>,
29
+ pub(crate) speed_up: Option<bool>,
30
+ pub(crate) translate: Option<bool>,
31
+ pub(crate) no_context: Option<bool>,
32
+ pub(crate) print_special: Option<bool>,
33
+ pub(crate) print_realtime: Option<bool>,
34
+ pub(crate) print_progress: Option<bool>,
35
+ pub(crate) token_timestamps: Option<bool>,
36
+ pub(crate) no_timestamps: Option<bool>,
37
+ pub(crate) temperature_inc: Option<f32>,
38
+ pub(crate) entropy_threshold: Option<f32>,
39
+ pub(crate) single_segment: Option<bool>,
40
+ pub(crate) suppress_non_speech_tokens: Option<bool>,
41
+ pub(crate) n_max_text_ctx: Option<usize>,
42
  // pub(crate) tinydiarize: bool,
43
  pub(crate) language: Option<String>,
44
  }
 
46
  impl WhisperParams {
47
  pub(crate) fn to_full_params<'a, 'b>(&'a self, tokens: &'b [c_int]) -> FullParams<'a, 'b> {
48
  let mut param = FullParams::new(Default::default());
49
+ if let Some(print_progress) = self.print_progress.as_ref() {
50
+ param.set_print_progress(*print_progress);
51
+ }
52
+ if let Some(print_special) = self.print_special.as_ref() {
53
+ param.set_print_special(*print_special);
54
+ }
55
+ if let Some(print_realtime) = self.print_realtime.as_ref() {
56
+ param.set_print_realtime(*print_realtime);
57
+ }
58
+ if let Some(single_segment) = self.single_segment.as_ref() {
59
+ param.set_single_segment(*single_segment);
60
+ }
61
+ if let Some(no_timestamps) = self.no_timestamps.as_ref() {
62
+ param.set_print_timestamps(!no_timestamps);
63
+ }
64
+ if let Some(token_timestamps) = self.token_timestamps.as_ref() {
65
+ param.set_token_timestamps(*token_timestamps);
66
+ }
67
+ if let Some(translate) = self.translate.as_ref() {
68
+ param.set_translate(*translate);
69
+ }
70
+ if let Some(max_tokens) = self.max_tokens.as_ref() {
71
+ param.set_max_tokens(*max_tokens as i32);
72
+ }
73
+ param.set_language(self.language.as_deref());
74
+ if let Some(n_threads) = self.n_threads.as_ref() {
75
+ param.set_n_threads(*n_threads as i32);
76
+ }
77
+ if let Some(audio_ctx) = self.audio_ctx.as_ref() {
78
+ param.set_audio_ctx(*audio_ctx as i32);
79
+ }
80
+ if let Some(speed_up) = self.speed_up.as_ref() {
81
+ param.set_speed_up(*speed_up);
82
+ }
83
  // param.set_tdrz_enable(self.tinydiarize);
84
+ if let Some(temperature_inc) = self.temperature_inc.as_ref() {
85
+ param.set_temperature_inc(*temperature_inc);
86
+ }
87
+ if let Some(suppress_non_speech_tokens) = self.suppress_non_speech_tokens.as_ref() {
88
+ param.set_suppress_non_speech_tokens(*suppress_non_speech_tokens);
89
+ }
90
+ if let Some(no_context) = self.no_context.as_ref() {
91
+ param.set_no_context(*no_context);
92
+ }
93
+ if let Some(entropy_threshold) = self.entropy_threshold.as_ref() {
94
+ param.set_entropy_thold(*entropy_threshold);
95
+ }
96
+ if let Some(n_max_text_ctx) = self.n_max_text_ctx.as_ref() {
97
+ param.set_n_max_text_ctx(*n_max_text_ctx as i32);
98
+ }
99
 
100
+ param.set_tokens(tokens);
101
  param
102
  }
103
  }
src/main.rs CHANGED
@@ -104,7 +104,7 @@ async fn stream_speaker(
104
  ws.on_upgrade(|mut socket| async move {
105
  let _origin_tx = lesson.voice_channel();
106
  let mut transcribe_rx = lesson.transcript_channel();
107
- let whisper = WhisperHandler::new(SETTINGS.whisper.clone(), prompt)
108
  .expect("failed to create whisper");
109
  let mut whisper_transcribe_rx = whisper.subscribe();
110
  loop {
@@ -118,7 +118,7 @@ async fn stream_speaker(
118
  msg = socket.next() => {
119
  match msg.as_ref() {
120
  Some(Ok(Message::Binary(bin))) => {
121
- let _ = whisper.send(bin.to_vec()).await; // whisper test
122
  // if let Err(e) = origin_tx.send(bin.to_vec()).await {
123
  // tracing::warn!("failed to send voice: {}", e);
124
  // break;
@@ -173,7 +173,7 @@ async fn stream_listener(
173
  ws: WebSocket,
174
  ) -> impl IntoResponse {
175
  let lesson_opt = ctx.lessons_manager.get_lesson(query.id).await;
176
- tracing::debug!("listener param = {:?}", query);
177
 
178
  ws.on_upgrade(|mut socket| async move {
179
  let voice_id = match query.voice.parse() {
 
104
  ws.on_upgrade(|mut socket| async move {
105
  let _origin_tx = lesson.voice_channel();
106
  let mut transcribe_rx = lesson.transcript_channel();
107
+ let mut whisper = WhisperHandler::new(SETTINGS.whisper.clone(), prompt)
108
  .expect("failed to create whisper");
109
  let mut whisper_transcribe_rx = whisper.subscribe();
110
  loop {
 
118
  msg = socket.next() => {
119
  match msg.as_ref() {
120
  Some(Ok(Message::Binary(bin))) => {
121
+ let _ = whisper.send_bytes(bin.to_vec()).await; // whisper test
122
  // if let Err(e) = origin_tx.send(bin.to_vec()).await {
123
  // tracing::warn!("failed to send voice: {}", e);
124
  // break;
 
173
  ws: WebSocket,
174
  ) -> impl IntoResponse {
175
  let lesson_opt = ctx.lessons_manager.get_lesson(query.id).await;
176
+ debug!("listener param = {:?}", query);
177
 
178
  ws.on_upgrade(|mut socket| async move {
179
  let voice_id = match query.voice.parse() {
src/whisper.rs CHANGED
@@ -1,24 +1,26 @@
1
  use std::{
2
  collections::VecDeque,
3
- ffi::c_int,
4
  fmt::{Debug, Display, Formatter},
5
  thread::sleep,
6
  time::Duration,
7
  };
 
8
 
9
  use once_cell::sync::Lazy;
10
  use tokio::sync::{broadcast, mpsc, oneshot};
11
- use tracing::{debug, trace};
12
- use whisper_rs::{convert_integer_to_float_audio, WhisperContext, WhisperState, WhisperToken};
13
- use whisper_rs_sys::WHISPER_SAMPLE_RATE;
14
 
15
  use crate::config::{Settings, SETTINGS};
16
  use crate::{config::WhisperConfig, group::GroupedWithin};
17
 
 
 
18
  static WHISPER_CONTEXT: Lazy<WhisperContext> = Lazy::new(|| {
19
  let settings = Settings::new().expect("Failed to initialize settings.");
20
  if tracing::enabled!(tracing::Level::DEBUG) {
21
- let info = print_system_info();
22
  debug!("system_info: n_threads = {} / {} | {}\n",
23
  settings.whisper.params.n_threads.unwrap_or(0),
24
  std::thread::available_parallelism().map(|c| c.get()).unwrap_or(0),
@@ -27,13 +29,6 @@ static WHISPER_CONTEXT: Lazy<WhisperContext> = Lazy::new(|| {
27
  WhisperContext::new(&settings.whisper.model).expect("failed to create WhisperContext")
28
  });
29
 
30
- fn print_system_info() -> String {
31
- unsafe {
32
- let raw_info = whisper_rs_sys::whisper_print_system_info();
33
- let info = std::ffi::CStr::from_ptr(raw_info);
34
- info.to_str().unwrap_or("failed to get system info").to_string()
35
- }
36
- }
37
 
38
  #[derive(Debug)]
39
  pub(crate) enum Error {
@@ -70,16 +65,21 @@ impl std::error::Error for Error {
70
  }
71
  }
72
 
73
- fn pcm_i16_to_f32(input: &[u8]) -> Vec<f32> {
74
- let pcm_i16 = input
75
  .chunks_exact(2)
76
  .map(|chunk| {
77
  let mut buf = [0u8; 2];
78
  buf.copy_from_slice(chunk);
79
  i16::from_le_bytes(buf)
80
  })
81
- .collect::<Vec<i16>>();
82
- convert_integer_to_float_audio(pcm_i16.as_slice())
 
 
 
 
 
83
  }
84
 
85
  #[derive(Clone, Debug)]
@@ -87,20 +87,21 @@ pub struct Segment {
87
  pub start_timestamp: i64,
88
  pub end_timestamp: i64,
89
  pub text: String,
90
- tokens: Vec<c_int>,
91
  }
92
 
93
  pub struct WhisperHandler {
94
- tx: mpsc::Sender<Vec<u8>>,
95
- transcription_tx: broadcast::Sender<Vec<Segment>>,
96
  stop_handle: Option<oneshot::Sender<()>>,
97
  }
98
 
99
  impl WhisperHandler {
100
  pub(crate) fn new(config: WhisperConfig, prompt: String) -> Result<Self, Error> {
 
101
  let (stop_handle, mut stop_signal) = oneshot::channel();
102
- let (pcm_tx, pcm_rx) = mpsc::channel::<Vec<u8>>(128);
103
- let (transcription_tx, _) = broadcast::channel::<Vec<Segment>>(128);
104
  let shared_transcription_tx = transcription_tx.clone();
105
  let state = WHISPER_CONTEXT
106
  .create_state()
@@ -109,21 +110,46 @@ impl WhisperHandler {
109
  .tokenize(prompt.as_str(), SETTINGS.whisper.max_prompt_tokens)
110
  .map_err(|e| Error::whisper_error("failed to tokenize prompt", e))?;
111
  tokio::task::spawn_blocking(move || {
 
 
112
  let mut detector = Detector::new(state, &SETTINGS.whisper, preset_prompt_tokens);
113
  let mut grouped = GroupedWithin::new(
114
- detector.n_samples_step * 2,
115
  Duration::from_millis(config.step_ms as u64),
116
  pcm_rx,
117
  u16::MAX as usize,
118
  );
119
  while let Err(oneshot::error::TryRecvError::Empty) = stop_signal.try_recv() {
 
 
 
 
 
 
 
 
 
120
  let new_pcm_f32 = match grouped.next() {
121
  Err(mpsc::error::TryRecvError::Disconnected) => break,
122
  Err(mpsc::error::TryRecvError::Empty) => {
123
  sleep(Duration::from_millis(10));
124
  continue;
125
  }
126
- Ok(data) => pcm_i16_to_f32(&data),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  };
128
 
129
  detector.feed(new_pcm_f32);
@@ -135,26 +161,22 @@ impl WhisperHandler {
135
  result
136
  }
137
  Err(err) => {
138
- tracing::warn!("failed to inference: {}", err);
139
  continue;
140
  }
141
  };
142
 
143
- for segment in segments.iter() {
144
- trace!(
145
- "[{}-{}]s SEGMENT: {}",
146
- segment.start_timestamp as f32 / 1000.0,
147
- segment.end_timestamp as f32 / 1000.0,
148
- segment.text
149
- );
150
- }
151
-
152
- if let Err(e) = shared_transcription_tx.send(segments) {
153
  tracing::error!("failed to send transcription: {}", e);
154
  break;
155
  };
156
  }
157
  });
 
158
  Ok(Self {
159
  tx: pcm_tx,
160
  transcription_tx,
@@ -162,27 +184,34 @@ impl WhisperHandler {
162
  })
163
  }
164
 
165
- pub fn subscribe(&self) -> broadcast::Receiver<Vec<Segment>> {
166
  self.transcription_tx.subscribe()
167
  }
168
 
169
- pub async fn send(&self, data: Vec<u8>) -> Result<(), mpsc::error::SendError<Vec<u8>>> {
170
  self.tx.send(data).await
171
  }
 
 
 
 
 
172
  }
173
 
174
  #[allow(dead_code)]
175
  struct Detector {
176
  state: WhisperState<'static>,
177
  config: &'static WhisperConfig,
 
 
 
178
  preset_prompt_tokens: Vec<WhisperToken>,
179
  n_samples_keep: usize,
180
  n_samples_step: usize,
181
  n_samples_len: usize,
182
- prompt_tokens: Vec<c_int>,
183
  pcm_f32: VecDeque<f32>,
184
  offset: usize,
185
- stable_offset: usize,
186
  }
187
 
188
  impl Detector {
@@ -194,14 +223,16 @@ impl Detector {
194
  Detector {
195
  state,
196
  config,
 
 
 
197
  preset_prompt_tokens,
198
- n_samples_keep: (config.keep_ms * WHISPER_SAMPLE_RATE / 1000) as usize,
199
- n_samples_step: (config.step_ms * WHISPER_SAMPLE_RATE / 1000) as usize,
200
- n_samples_len: (config.length_ms * WHISPER_SAMPLE_RATE / 1000) as usize,
201
  prompt_tokens: Default::default(),
202
- pcm_f32: VecDeque::from(vec![0f32; 30 * WHISPER_SAMPLE_RATE as usize]),
203
  offset: 0,
204
- stable_offset: 0,
205
  }
206
  }
207
 
@@ -210,20 +241,16 @@ impl Detector {
210
  if self.pcm_f32.len() < self.n_samples_len {
211
  return;
212
  }
213
- let len_to_drain = self
214
- .pcm_f32
215
- .drain(0..(self.pcm_f32.len() - self.n_samples_len))
216
- .len();
217
- self.offset += len_to_drain;
 
218
  }
219
 
220
  fn inference(&mut self) -> Result<Vec<Segment>, Error> {
221
- let prompt_tokens = [
222
- self.preset_prompt_tokens.as_slice(),
223
- self.prompt_tokens.as_slice(),
224
- ]
225
- .concat();
226
- let params = self.config.params.to_full_params(prompt_tokens.as_slice());
227
  let start = std::time::Instant::now();
228
  let _ = self
229
  .state
@@ -231,35 +258,32 @@ impl Detector {
231
  .map_err(|e| Error::whisper_error("failed to initialize WhisperState", e))?;
232
  let end = std::time::Instant::now();
233
  if end - start > Duration::from_millis(self.config.step_ms as u64) {
234
- tracing::warn!(
235
- "full([{}]) took {} ms too slow",
236
- self.pcm_f32.len(),
237
- (end - start).as_millis()
238
- );
239
  }
240
 
241
- let timestamp_offset: i64 = (self.offset * 1000 / WHISPER_SAMPLE_RATE as usize) as i64;
242
- let stable_offset: i64 = (self.stable_offset * 1000 / WHISPER_SAMPLE_RATE as usize) as i64;
243
  let num_segments = self
244
  .state
245
  .full_n_segments()
246
  .map_err(|e| Error::whisper_error("failed to get number of segments", e))?;
247
  let mut segments: Vec<Segment> = Vec::with_capacity(num_segments as usize);
248
  for i in 0..num_segments {
249
- let end_timestamp: i64 = timestamp_offset
250
- + 10 * self
251
- .state
252
- .full_get_segment_t1(i)
253
- .map_err(|e| Error::whisper_error("failed to get end timestamp", e))?;
254
- if end_timestamp <= stable_offset {
255
- continue;
256
- }
257
-
258
  let start_timestamp: i64 = timestamp_offset
259
  + 10 * self
260
  .state
261
  .full_get_segment_t0(i)
262
  .map_err(|e| Error::whisper_error("failed to get start timestamp", e))?;
 
 
 
 
 
 
 
263
  let segment = self
264
  .state
265
  .full_get_segment_text(i)
@@ -270,11 +294,9 @@ impl Detector {
270
  .map_err(|e| Error::whisper_error("failed to get segment tokens", e))?;
271
  let mut segment_tokens = Vec::with_capacity(num_tokens as usize);
272
  for j in 0..num_tokens {
273
- segment_tokens.push(
274
- self.state
275
- .full_get_token_id(i, j)
276
- .map_err(|e| Error::whisper_error("failed to get token", e))?,
277
- );
278
  }
279
 
280
  segments.push(Segment {
@@ -285,52 +307,132 @@ impl Detector {
285
  });
286
  }
287
 
288
- let Some((_last, init)) = segments.split_last() else {
289
- return Ok(Vec::default());
290
- };
291
 
292
- let Some((last_2_seg, _)) = init.split_last() else {
293
- return Ok(Vec::default());
 
294
  };
295
 
296
- let offset = (last_2_seg.end_timestamp - timestamp_offset) as usize / 1000
297
- * WHISPER_SAMPLE_RATE as usize;
298
- self.stable_offset = offset;
299
- self.drop_stable_by_segments(init);
300
- Ok(init.into())
301
- }
302
 
303
- fn drop_stable_by_segments(&mut self, stable_segments: &[Segment]) {
304
- let Some(last) = stable_segments.last() else {
305
- return;
306
- };
307
- let drop_offset: usize =
308
- last.end_timestamp as usize / 1000 * WHISPER_SAMPLE_RATE as usize - self.offset;
309
- if drop_offset > self.pcm_f32.len() {
310
- return; // Arithmetic overflow
311
  }
312
- let len_to_drain = self.pcm_f32.drain(0..drop_offset).len();
313
- self.offset += len_to_drain;
314
 
315
- for segment in stable_segments.iter() {
316
- self.prompt_tokens.extend(&segment.tokens);
317
- }
318
- if self.prompt_tokens.len() > self.config.max_prompt_tokens {
319
- let _ = self
320
- .prompt_tokens
321
- .drain(0..(self.prompt_tokens.len() - self.config.max_prompt_tokens))
322
- .len();
 
 
 
 
 
 
 
323
  }
 
 
 
 
324
  }
325
  }
326
 
327
  impl Drop for WhisperHandler {
328
  fn drop(&mut self) {
329
  let Some(stop_handle) = self.stop_handle.take() else {
330
- return tracing::warn!("WhisperHandler::drop() called without stop_handle");
331
  };
332
  if stop_handle.send(()).is_err() {
333
- tracing::warn!("WhisperHandler::drop() failed to send stop signal");
334
  }
335
  }
336
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  use std::{
2
  collections::VecDeque,
 
3
  fmt::{Debug, Display, Formatter},
4
  thread::sleep,
5
  time::Duration,
6
  };
7
+ use fvad::SampleRate;
8
 
9
  use once_cell::sync::Lazy;
10
  use tokio::sync::{broadcast, mpsc, oneshot};
11
+ use tokio::time::Instant;
12
+ use tracing::{debug, trace, warn};
13
+ use whisper_rs::{convert_integer_to_float_audio, WhisperContext, WhisperState, WhisperToken, WhisperTokenData};
14
 
15
  use crate::config::{Settings, SETTINGS};
16
  use crate::{config::WhisperConfig, group::GroupedWithin};
17
 
18
+ const WHISPER_SAMPLE_RATE: usize = whisper_rs_sys::WHISPER_SAMPLE_RATE as usize;
19
+
20
  static WHISPER_CONTEXT: Lazy<WhisperContext> = Lazy::new(|| {
21
  let settings = Settings::new().expect("Failed to initialize settings.");
22
  if tracing::enabled!(tracing::Level::DEBUG) {
23
+ let info = whisper_rs::print_system_info();
24
  debug!("system_info: n_threads = {} / {} | {}\n",
25
  settings.whisper.params.n_threads.unwrap_or(0),
26
  std::thread::available_parallelism().map(|c| c.get()).unwrap_or(0),
 
29
  WhisperContext::new(&settings.whisper.model).expect("failed to create WhisperContext")
30
  });
31
 
 
 
 
 
 
 
 
32
 
33
  #[derive(Debug)]
34
  pub(crate) enum Error {
 
65
  }
66
  }
67
 
68
/// Decodes raw little-endian 16-bit PCM bytes into `i16` samples.
///
/// Bytes are consumed two at a time. If `input` has an odd length, the final
/// unpaired byte is ignored because it cannot form a complete sample.
fn u8_to_i16(input: &[u8]) -> Vec<i16> {
    input
        .chunks_exact(2)
        // `chunks_exact(2)` guarantees every chunk holds exactly two bytes.
        .map(|pair| i16::from_le_bytes([pair[0], pair[1]]))
        .collect()
}
78
+
79
+ #[derive(Clone, Debug)]
80
+ pub enum Output {
81
+ Unstable(Segment),
82
+ Stable(Segment),
83
  }
84
 
85
  #[derive(Clone, Debug)]
 
87
  pub start_timestamp: i64,
88
  pub end_timestamp: i64,
89
  pub text: String,
90
+ tokens: Vec<WhisperTokenData>,
91
  }
92
 
93
  pub struct WhisperHandler {
94
+ tx: mpsc::Sender<Vec<i16>>,
95
+ transcription_tx: broadcast::Sender<Vec<Output>>,
96
  stop_handle: Option<oneshot::Sender<()>>,
97
  }
98
 
99
  impl WhisperHandler {
100
  pub(crate) fn new(config: WhisperConfig, prompt: String) -> Result<Self, Error> {
101
+ let vad_slice_size = WHISPER_SAMPLE_RATE / 100 * 3;
102
  let (stop_handle, mut stop_signal) = oneshot::channel();
103
+ let (pcm_tx, pcm_rx) = mpsc::channel::<Vec<i16>>(128);
104
+ let (transcription_tx, _) = broadcast::channel::<Vec<Output>>(128);
105
  let shared_transcription_tx = transcription_tx.clone();
106
  let state = WHISPER_CONTEXT
107
  .create_state()
 
110
  .tokenize(prompt.as_str(), SETTINGS.whisper.max_prompt_tokens)
111
  .map_err(|e| Error::whisper_error("failed to tokenize prompt", e))?;
112
  tokio::task::spawn_blocking(move || {
113
+ let mut vad = fvad::Fvad::new().expect("failed to create VAD")
114
+ .set_sample_rate(SampleRate::Rate16kHz);
115
  let mut detector = Detector::new(state, &SETTINGS.whisper, preset_prompt_tokens);
116
  let mut grouped = GroupedWithin::new(
117
+ detector.n_samples_step,
118
  Duration::from_millis(config.step_ms as u64),
119
  pcm_rx,
120
  u16::MAX as usize,
121
  );
122
  while let Err(oneshot::error::TryRecvError::Empty) = stop_signal.try_recv() {
123
+ if detector.has_crossed_next_line() {
124
+ if let Some(segment) = detector.next_line() {
125
+ let segments = vec![Output::Stable(segment)];
126
+ if let Err(e) = shared_transcription_tx.send(segments) {
127
+ tracing::error!("failed to send transcription: {}", e);
128
+ break;
129
+ };
130
+ }
131
+ }
132
  let new_pcm_f32 = match grouped.next() {
133
  Err(mpsc::error::TryRecvError::Disconnected) => break,
134
  Err(mpsc::error::TryRecvError::Empty) => {
135
  sleep(Duration::from_millis(10));
136
  continue;
137
  }
138
+ Ok(data) => {
139
+ let active_voice = data
140
+ .chunks(vad_slice_size)
141
+ .filter(|frame| {
142
+ if frame.len() != vad_slice_size {
143
+ true
144
+ } else {
145
+ vad.is_voice_frame(frame).unwrap_or(true)
146
+ }
147
+ // true
148
+ })
149
+ .collect::<Vec<_>>()
150
+ .concat();
151
+ convert_integer_to_float_audio(&active_voice)
152
+ },
153
  };
154
 
155
  detector.feed(new_pcm_f32);
 
161
  result
162
  }
163
  Err(err) => {
164
+ warn!("failed to inference: {}", err);
165
  continue;
166
  }
167
  };
168
 
169
+ let outputs = segments
170
+ .iter()
171
+ .map(|segment| Output::Unstable(segment.clone()))
172
+ .collect::<Vec<_>>();
173
+ if let Err(e) = shared_transcription_tx.send(outputs) {
 
 
 
 
 
174
  tracing::error!("failed to send transcription: {}", e);
175
  break;
176
  };
177
  }
178
  });
179
+
180
  Ok(Self {
181
  tx: pcm_tx,
182
  transcription_tx,
 
184
  })
185
  }
186
 
187
+ pub fn subscribe(&self) -> broadcast::Receiver<Vec<Output>> {
188
  self.transcription_tx.subscribe()
189
  }
190
 
191
+ pub async fn send_i16(&mut self, data: Vec<i16>) -> Result<(), mpsc::error::SendError<Vec<i16>>> {
192
  self.tx.send(data).await
193
  }
194
+
195
+ pub async fn send_bytes(&mut self, data: Vec<u8>) -> Result<(), mpsc::error::SendError<Vec<i16>>> {
196
+ let i16_data = u8_to_i16(&data);
197
+ self.send_i16(i16_data).await
198
+ }
199
  }
200
 
201
  #[allow(dead_code)]
202
  struct Detector {
203
  state: WhisperState<'static>,
204
  config: &'static WhisperConfig,
205
+ start_time: Instant,
206
+ segment: Option<Segment>,
207
+ line_num: usize,
208
  preset_prompt_tokens: Vec<WhisperToken>,
209
  n_samples_keep: usize,
210
  n_samples_step: usize,
211
  n_samples_len: usize,
212
+ prompt_tokens: Vec<WhisperToken>,
213
  pcm_f32: VecDeque<f32>,
214
  offset: usize,
 
215
  }
216
 
217
  impl Detector {
 
223
  Detector {
224
  state,
225
  config,
226
+ start_time: Instant::now(),
227
+ segment: None,
228
+ line_num: 0,
229
  preset_prompt_tokens,
230
+ n_samples_keep: config.keep_ms * WHISPER_SAMPLE_RATE / 1000,
231
+ n_samples_step: config.step_ms * WHISPER_SAMPLE_RATE / 1000,
232
+ n_samples_len: config.length_ms * WHISPER_SAMPLE_RATE / 1000,
233
  prompt_tokens: Default::default(),
234
+ pcm_f32: VecDeque::with_capacity(config.length_ms * WHISPER_SAMPLE_RATE / 1000),
235
  offset: 0,
 
236
  }
237
  }
238
 
 
241
  if self.pcm_f32.len() < self.n_samples_len {
242
  return;
243
  }
244
+ // let len_to_drain = self
245
+ // .pcm_f32
246
+ // .drain(0..(self.pcm_f32.len() - self.n_samples_len))
247
+ // .len();
248
+ // warn!("ASR too slow, drain {} samples", len_to_drain);
249
+ // self.offset += len_to_drain;
250
  }
251
 
252
  fn inference(&mut self) -> Result<Vec<Segment>, Error> {
253
+ let params = self.config.params.to_full_params(self.prompt_tokens.as_slice());
 
 
 
 
 
254
  let start = std::time::Instant::now();
255
  let _ = self
256
  .state
 
258
  .map_err(|e| Error::whisper_error("failed to initialize WhisperState", e))?;
259
  let end = std::time::Instant::now();
260
  if end - start > Duration::from_millis(self.config.step_ms as u64) {
261
+ // warn!(
262
+ // "full([{}]) took {} ms too slow",
263
+ // self.pcm_f32.len(),
264
+ // (end - start).as_millis()
265
+ // );
266
  }
267
 
268
+ let timestamp_offset: i64 = (self.offset * 1000 / WHISPER_SAMPLE_RATE) as i64;
 
269
  let num_segments = self
270
  .state
271
  .full_n_segments()
272
  .map_err(|e| Error::whisper_error("failed to get number of segments", e))?;
273
  let mut segments: Vec<Segment> = Vec::with_capacity(num_segments as usize);
274
  for i in 0..num_segments {
 
 
 
 
 
 
 
 
 
275
  let start_timestamp: i64 = timestamp_offset
276
  + 10 * self
277
  .state
278
  .full_get_segment_t0(i)
279
  .map_err(|e| Error::whisper_error("failed to get start timestamp", e))?;
280
+
281
+ let end_timestamp: i64 = timestamp_offset
282
+ + 10 * self
283
+ .state
284
+ .full_get_segment_t1(i)
285
+ .map_err(|e| Error::whisper_error("failed to get end timestamp", e))?;
286
+
287
  let segment = self
288
  .state
289
  .full_get_segment_text(i)
 
294
  .map_err(|e| Error::whisper_error("failed to get segment tokens", e))?;
295
  let mut segment_tokens = Vec::with_capacity(num_tokens as usize);
296
  for j in 0..num_tokens {
297
+ let token_data = self.state.full_get_token_data(i, j)
298
+ .map_err(|e| Error::whisper_error("failed to get token data", e))?;
299
+ segment_tokens.push(token_data);
 
 
300
  }
301
 
302
  segments.push(Segment {
 
307
  });
308
  }
309
 
310
+ self.segment = segments.first().cloned();
311
+ Ok(segments.to_vec())
312
+ }
313
 
314
+ fn remember_prompt(&mut self) {
315
+ let Some(segment) = self.segment.as_ref() else {
316
+ return
317
  };
318
 
319
+ let tokens = segment
320
+ .tokens
321
+ .iter()
322
+ .map(|td| td.tid)
323
+ .collect::<Vec<WhisperToken>>();
 
324
 
325
+ self.prompt_tokens.extend(tokens);
326
+ if self.prompt_tokens.len() > self.config.max_prompt_tokens {
327
+ let _ = self.prompt_tokens.drain(0..(self.prompt_tokens.len() - self.config.max_prompt_tokens)).len();
 
 
 
 
 
328
  }
329
+ }
 
330
 
331
+ fn has_crossed_next_line(&self) -> bool {
332
+ let now = Instant::now();
333
+ let elapsed = now - self.start_time;
334
+ let line_number: usize = (elapsed.as_millis() / self.config.length_ms as u128) as usize;
335
+ line_number > self.line_num
336
+ }
337
+
338
+ fn next_line(&mut self) -> Option<Segment> {
339
+ if self.pcm_f32.len() > self.n_samples_keep {
340
+ let drain_size = self.pcm_f32.drain(0..(self.pcm_f32.len() - self.n_samples_keep)).len();
341
+ self.offset += drain_size;
342
+ } else {
343
+ let size_will_clear = self.pcm_f32.len();
344
+ self.pcm_f32.clear();
345
+ self.offset += size_will_clear;
346
  }
347
+
348
+ self.line_num += 1;
349
+ self.remember_prompt();
350
+ self.segment.take()
351
  }
352
  }
353
 
354
  impl Drop for WhisperHandler {
355
  fn drop(&mut self) {
356
  let Some(stop_handle) = self.stop_handle.take() else {
357
+ return warn!("WhisperHandler::drop() called without stop_handle");
358
  };
359
  if stop_handle.send(()).is_err() {
360
+ warn!("WhisperHandler::drop() failed to send stop signal");
361
  }
362
  }
363
  }
364
+
365
#[cfg(test)]
mod test {
    use super::*;
    use std::io::{stdout, Write};

    /// Renders one output on the terminal.
    ///
    /// A stable segment replaces the current line and ends it with a
    /// newline; an unstable segment overwrites the current line in place.
    /// Not called by the test below; kept for interactive debugging.
    #[allow(dead_code)]
    fn print_output(output: Output) {
        match output {
            Output::Stable(stable) => {
                // Erase the current line and return to column 0.
                print!("\x1b[2K\r");
                println!("{}", stable.text);
            }
            Output::Unstable(unstable) => {
                print!("\x1b[2K\r");
                print!("{}", " ".repeat(100));
                print!("\x1b[2K\r");
                print!("{} ...", unstable.text);
            }
        }
        stdout().flush().unwrap();
    }

    /// Streams a WAV file through a `WhisperHandler` and prints every
    /// stable segment that comes back.
    #[tokio::test]
    #[tracing_test::traced_test]
    async fn test_whisper_handler() {
        let mut whisper_handler = WhisperHandler::new(
            SETTINGS.whisper.clone(),
            "Harry Potter and the Philosopher's Stone".to_string(),
        )
        .expect("failed to create WhisperHandler");

        let wav = hound::WavReader::open("samples/ADHD_1A.wav").expect("failed to open wav");
        println!("{:?}", wav.spec());
        let samples = wav
            .into_samples::<i16>()
            .map(|s| s.unwrap())
            .collect::<Vec<i16>>();

        // Subscribe before the handler is moved into the sending future.
        let mut rx = whisper_handler.subscribe();

        let send_fut = async move {
            // 1600 samples every 100 ms.
            for chunk in samples.chunks(1600) {
                whisper_handler
                    .send_i16(chunk.to_vec())
                    .await
                    .expect("failed to send sample");
                tokio::time::sleep(Duration::from_millis(100)).await;
            }
            // Dropping the handler fires the stop signal and releases its
            // senders. Once the worker exits, the broadcast channel closes
            // and the receiving loop below ends; without this the test
            // would never terminate.
            drop(whisper_handler);
        };

        let recv_fut = async {
            // Ends when the channel is closed (or the receiver lags).
            while let Ok(outputs) = rx.recv().await {
                if let Some(Output::Stable(stable)) = outputs.first() {
                    println!("{}", stable.text);
                }
            }
        };

        tokio::join!(send_fut, recv_fut);
    }
}