Some profanity prevention stuff, breadmixer rewrite

2025-12-27 20:47:34 -05:00
parent 6e93022cb3
commit c4187c1590
7 changed files with 402 additions and 2 deletions

bin/breadmixer.py (new file, 113 lines)

@@ -0,0 +1,113 @@
import copy
import json
import os
import sys
from datetime import datetime
from pathlib import Path

from breadbot_common import SQLite, MySQL, TranscriptableFile, mix_audio_with_ffmpeg
from txtai.pipeline import Transcription

# Maximum number of inputs handed to a single ffmpeg mixing pass.
MAX_FILES_PER_CYCLE = 50

script_path = Path(__file__).resolve()
config_path = Path(script_path.parent, "config.json")
with open(config_path, 'r') as config_file:
    config_json = json.loads(config_file.read())

# Select the database backend from the config.
if config_json["db"]["type"].casefold() == "sqlite":
    db = SQLite(Path(script_path.parent.parent, config_json["db"]["db_path"]))
else:
    db = MySQL(
        config_json["db"]["host"],
        config_json["db"]["user"],
        config_json["db"]["password"],
        config_json["db"]["db_name"]
    )

# db.query returns (row_count, rows); pick up finished calls that still need
# consolidation and transcription.
calls_needing_work = db.query(
    "SELECT * FROM db_call WHERE NOT call_end_time IS NULL AND call_consolidated = 0 AND call_transcribed = 0"
)
if calls_needing_work[0] == 0:
    print("No work to do, exiting")
    sys.exit(0)

transcriber = Transcription("openai/whisper-base")

for call in calls_needing_work[1]:
    # call[0] is call_id, call[1] is the call start time. Each call has its own
    # folder of per-user clips named "<unix-millis>-<user-snowflake>.<ext>".
    call_folder = Path(config_json["media_voice_folder"], call[0])
    all_files = os.listdir(call_folder)

    transcriptable_files = []
    for file in all_files:
        file_name_no_extension = file.split('.')[0]
        timestamp = int(file_name_no_extension.split('-')[0])
        user_snowflake = file_name_no_extension.split('-')[1]
        file_stamp_as_datetime = datetime.fromtimestamp(timestamp / 1000)
        # Offset of this clip from the start of the call, in milliseconds.
        time_diff = file_stamp_as_datetime - call[1]
        transcriptable_files.append(TranscriptableFile(
            file_path=file,
            real_date=file_stamp_as_datetime,
            milliseconds_from_start=int((time_diff.seconds * 1000) + (time_diff.microseconds / 1000)),
            user_snowflake=user_snowflake
        ))
    transcriptable_files.sort(key=lambda a: a.milliseconds_from_start)

    # TODO Possibly RAM-abusive solution to wanting to keep the original list around
    ffmpeg_files = copy.deepcopy(transcriptable_files)

    # TODO Error handling for all ffmpeg operations
    # Mix in batches of MAX_FILES_PER_CYCLE, feeding each pass's outputs back in,
    # until a single final pass can produce the consolidated recording.
    while len(ffmpeg_files) > MAX_FILES_PER_CYCLE:
        ffmpeg_files = [
            mix_audio_with_ffmpeg(
                ffmpeg_files[index:min(index + MAX_FILES_PER_CYCLE, len(ffmpeg_files))],
                config_json["media_voice_folder"],
                call[0],
                False
            )
            for index in range(0, len(ffmpeg_files), MAX_FILES_PER_CYCLE)
        ]
    final_pass_file = mix_audio_with_ffmpeg(
        ffmpeg_files,
        config_json["media_voice_folder"],
        call[0],
        True
    )
    db.update("db_call", ["call_consolidated"], [1, call[0]], [{
        "name": "call_id",
        "compare": "="
    }])

    # Remove the intermediate mixes left behind by the batching passes.
    for file in os.listdir(call_folder):
        if file.startswith("intermediate"):
            os.remove(Path(call_folder, file))

    # Transcribe each original clip separately so the text stays attributed to
    # the user who spoke it. Use the full path so transcription does not depend
    # on the working directory.
    for file in transcriptable_files:
        text = transcriber(str(Path(call_folder, file.file_path)))
        db.insert(
            "db_call_transcriptions",
            ["speaking_start_time", "text", "callCallId", "userUserSnowflake"],
            [file.real_date, text, call[0], file.user_snowflake]
        )
    db.update("db_call", ["call_transcribed"], [1, call[0]], [{
        "name": "call_id",
        "compare": "="
    }])
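
On the in-code TODO about error handling for the ffmpeg operations: one possible shape is a small retry wrapper around the mixing call, sketched below. It assumes mix_audio_with_ffmpeg surfaces failures by raising (subprocess.CalledProcessError and OSError are guesses, not documented behaviour), and the wrapper name, retry count, and backoff are illustrative rather than part of breadbot_common.

from breadbot_common import mix_audio_with_ffmpeg
import subprocess
import time

def mix_with_retries(files, media_folder, call_id, final_pass, attempts=3):
    # Retry wrapper; the exception types caught here are assumptions about
    # how mix_audio_with_ffmpeg reports ffmpeg failures.
    for attempt in range(1, attempts + 1):
        try:
            return mix_audio_with_ffmpeg(files, media_folder, call_id, final_pass)
        except (subprocess.CalledProcessError, OSError) as err:
            print(f"ffmpeg mix failed (attempt {attempt}/{attempts}): {err}")
            time.sleep(attempt)  # crude linear backoff before the next attempt
    raise RuntimeError(f"Giving up on mixing {len(files)} files for call {call_id}")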