Reimpl özden batch lm gen

coqui-ai · wasertech · May 12, 2022 · Jun 24, 2022 · Jun 24, 2022 · Jun 24, 2022
commit 22c8cd5c13451079a0dabb94eb38e1485c12c581
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
@@ -823,6 +823,9 @@ jobs:
           # Test FLAC input
           time ./bin/run-ci-ldc93s1-flac.sh --epochs 1
 
+          # Test LM gen
+          time ./bin/run-ci-lm-gen-batch.sh
+
           # Test LM opt
           time ./bin/run-ci-lm-opt.sh
   training-sdb-tests:

diff --git a/Dockerfile.train b/Dockerfile.train
@@ -39,6 +39,8 @@ RUN apt-get update && \
         libvorbisfile3 \
         libopusfile0 \
         libsndfile1 \
+        libboost-program-options-dev \
+        libboost-thread-dev \
         sox \
         libsox-fmt-mp3 \
         python3-venv \

diff --git a/bin/run-ci-lm-gen-batch.sh b/bin/run-ci-lm-gen-batch.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+# This test runs the batch language model generator (data/lm/generate_lm_batch.py) on the smoke-test vocabulary
+
+set -xe
+
+if [ ! -f lm_optimizer.py ]; then
+    echo "Please make sure you run this from STT's top level directory."
+    exit 1
+fi;
+
+
+
+lm_path="./data/lm"
+sources_lm_filepath="./data/smoke_test/vocab.txt"
+
+# Generate language model files from the smoke-test vocabulary into ${lm_path}
+# using the batch generator and the KenLM binaries under /code/kenlm/build/bin/
+
+python data/lm/generate_lm_batch.py \
+    --input_txt "${sources_lm_filepath}" \
+    --output_dir "${lm_path}" \
+    --top_k_list 30000 \
+    --arpa_order_list "4" \
+    --max_arpa_memory "85%" \
+    --arpa_prune_list "0|0|2" \
+    --binary_a_bits 255 \
+    --binary_q_bits 8 \
+    --binary_type trie \
+    --kenlm_bins /code/kenlm/build/bin/ \
+    -j 1
diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py