(defvar my-live-speech-buffer "*Speech*" "Buffer for displaying the live captions.")
(defvar my-live-speech-process nil)
(defvar my-live-speech-output-buffer "*Speech JSON*" "Buffer for the raw JSON output.")
(defvar my-live-speech-recent-words nil "Words used for calculating the WPM.")
(defvar my-live-speech-wpm-string nil "Mode line string showing the current WPM.")
(defvar my-live-speech-functions
  '(my-live-speech-display-in-speech-buffer
    my-live-speech-display-wpm
    my-live-speech-append-to-etherpad)
  "Functions to call with one argument, the recognition results.")
(defun my-live-speech-start ()
  "Turn on live captions."
  (interactive)
  (with-current-buffer (get-buffer-create my-live-speech-buffer)
    (unless (process-live-p my-live-speech-process)
      (let ((default-directory "~/proj/deepgram-live"))
        (message "%s" default-directory)
        (with-current-buffer (get-buffer-create my-live-speech-output-buffer)
          (erase-buffer))
        (setq my-live-speech-recent-words nil
              my-live-speech-wpm-string "READY ")
        (setq my-live-speech-process
              (make-process
               :command '("bash" "run.sh")
               :name "speech"
               :filter #'my-live-speech-json-filter
               :sentinel #'my-live-speech-process-sentinel
               :buffer my-live-speech-output-buffer)))
      (org-mode))
    (display-buffer (current-buffer))))
(defun my-live-speech-stop ()
  "Turn off live captions."
  (interactive)
  (when (process-live-p my-live-speech-process)
    (kill-process my-live-speech-process))
  (setq my-live-speech-wpm-string nil))
(defun my-live-speech-handle-json (line)
  "Parse the JSON object in LINE and pass it to `my-live-speech-functions'."
  (run-hook-with-args 'my-live-speech-functions
                      (json-parse-string line :object-type 'alist)))
(defun my-live-speech-process-sentinel (proc event)
  "Stop live captions when PROC finishes, based on EVENT."
  (when (string-match "finished" event)
    (my-live-speech-stop)))
(defun my-live-speech-json-filter (proc string)
  "Insert STRING from PROC and handle any complete lines of JSON."
  (when (buffer-live-p (process-buffer proc))
    (with-current-buffer (process-buffer proc)
      (let* ((proc-mark (process-mark proc))
             (moving (= (point) proc-mark)))
        ;; Insert the new output at the process mark.
        (save-excursion
          (goto-char proc-mark)
          (insert string)
          (set-marker proc-mark (point)))
        (if moving (goto-char proc-mark))
        ;; Handle complete lines one at a time, removing each from the buffer.
        (let ((pos (point-min)))
          (while (progn (goto-char pos)
                        (end-of-line)
                        (eq (following-char) ?\n))
            (let* ((end (point))
                   (line (buffer-substring pos end)))
              (delete-region pos (1+ end))
              (with-current-buffer (get-buffer my-live-speech-buffer)
                (my-live-speech-handle-json line)))))))))
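The functions listed in my-live-speech-functions aren't included in this post. Just to give a feel for the interface, here's a minimal sketch of what the display handler could look like, assuming the alist parsed from a final result has a transcript key, the way the Python code below prints alternatives[0]. This is an illustration, not my exact code.

;; Sketch only: append final transcripts to the *Speech* buffer.
;; Assumes RECOGNITION-RESULTS is the alist parsed from one line of
;; JSON, with a `transcript' key when the result is final.
(defun my-live-speech-display-in-speech-buffer (recognition-results)
  "Append the transcript in RECOGNITION-RESULTS to the speech buffer."
  (let ((transcript (alist-get 'transcript recognition-results)))
    (when (and transcript (not (string= transcript "")))
      (with-current-buffer (get-buffer-create my-live-speech-buffer)
        (save-excursion
          (goto-char (point-max))
          (insert transcript "\n"))))))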
Here's the Python code, based on the Deepgram streaming test suite. It's a very rough app.py:
import pyaudio
import asyncio
import json
import os
import websockets
from datetime import datetime
import wave
import sys
startTime = datetime.now()
key = os.environ['DEEPGRAM_API_KEY']
# Any non-empty value of LIVE_CAPTIONS_JSON enables JSON output; unset defaults to on.
live_json = os.environ.get('LIVE_CAPTIONS_JSON', True)
all_mic_data = []
all_transcripts = []
all_words = []
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 8000
audio_queue = asyncio.Queue()
REALTIME_RESOLUTION = 0.250
SAMPLE_SIZE = 0
def save_info():
    global SAMPLE_SIZE
    base = startTime.strftime('%Y%m%d%H%M')
    wave_file_path = os.path.abspath(f"{base}.wav")
    wave_file = wave.open(wave_file_path, "wb")
    wave_file.setnchannels(CHANNELS)
    wave_file.setsampwidth(SAMPLE_SIZE)
    wave_file.setframerate(RATE)
    wave_file.writeframes(b"".join(all_mic_data))
    wave_file.close()
    with open(f"{base}.txt", "w") as f:
        f.write("\n".join(all_transcripts))
    with open(f"{base}.json", "w") as f:
        f.write(json.dumps(all_words))
    if live_json:
        print(f'{{"msg": "🟢 Saved to {base}.txt , {base}.json , {base}.wav", "base": "{base}"}}')
    else:
        print(f"🟢 Saved to {base}.txt , {base}.json , {base}.wav")
def mic_callback(input_data, frame_count, time_info, status_flag):
    audio_queue.put_nowait(input_data)
    return (input_data, pyaudio.paContinue)
async def run(key, method="mic", format="text", **kwargs):
    deepgram_url = 'wss://api.deepgram.com/v1/listen?punctuate=true&smart_format=true&utterances=true&encoding=linear16&sample_rate=16000'
    async with websockets.connect(
        deepgram_url, extra_headers={"Authorization": "Token {}".format(key)}
    ) as ws:
        async def sender(ws):
            try:
                while True:
                    mic_data = await audio_queue.get()
                    all_mic_data.append(mic_data)
                    await ws.send(mic_data)
            except websockets.exceptions.ConnectionClosedOK:
                await ws.send(json.dumps({"type": "CloseStream"}))
                if live_json:
                    print('{"msg": "Closed."}')
                else:
                    print("Closed.")

        async def receiver(ws):
            """Print out the messages received from the server."""
            global all_words
            first_message = True
            first_transcript = True
            transcript = ""
            async for msg in ws:
                res = json.loads(msg)
                if first_message:
                    first_message = False
                try:
                    if res.get("msg"):
                        if live_json:
                            print(json.dumps(res))
                        else:
                            print(res["msg"])
                    if res.get("is_final"):
                        transcript = (
                            res.get("channel", {})
                            .get("alternatives", [{}])[0]
                            .get("transcript", "")
                        )
                        if transcript != "":
                            if first_transcript:
                                first_transcript = False
                            if live_json:
                                print(json.dumps(res.get("channel", {}).get("alternatives", [{}])[0]))
                            else:
                                print(transcript)
                            all_transcripts.append(transcript)
                            all_words = all_words + res.get("channel", {}).get("alternatives", [{}])[0].get("words", [])
                        if method == "mic" and "goodbye" in transcript.lower():
                            await ws.send(json.dumps({"type": "CloseStream"}))
                            if live_json:
                                print('{"msg": "Done."}')
                            else:
                                print("Done.")
                    if res.get("created"):
                        save_info()
                except KeyError:
                    print(f"🔴 ERROR: Received unexpected API response! {msg}")

        async def microphone():
            audio = pyaudio.PyAudio()
            stream = audio.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK,
                stream_callback=mic_callback,
            )
            stream.start_stream()
            global SAMPLE_SIZE
            SAMPLE_SIZE = audio.get_sample_size(FORMAT)
            while stream.is_active():
                await asyncio.sleep(0.1)
            stream.stop_stream()
            stream.close()

        functions = [
            asyncio.ensure_future(sender(ws)),
            asyncio.ensure_future(receiver(ws)),
        ]
        functions.append(asyncio.ensure_future(microphone()))
        if live_json:
            print('{"msg": "Ready."}')
        else:
            print("🟢 Ready.")
        await asyncio.gather(*functions)
def main():
    """Entrypoint for the example."""
    try:
        asyncio.run(run(key, "mic", "text"))
    except websockets.exceptions.InvalidStatusCode as e:
        print(f'🔴 ERROR: Could not connect to Deepgram! {e.headers.get("dg-error")}')
        print(
            f'🔴 Please contact Deepgram Support (developers@deepgram.com) with request ID {e.headers.get("dg-request-id")}'
        )
        return
    except websockets.exceptions.ConnectionClosedError as e:
        error_description = "Unknown websocket error."
        print(
            f"🔴 ERROR: Deepgram connection unexpectedly closed with code {e.code} and payload {e.reason}"
        )
        if e.reason == "DATA-0000":
            error_description = "The payload cannot be decoded as audio. It is either not audio data or is a codec unsupported by Deepgram."
        elif e.reason == "NET-0000":
            error_description = "The service has not transmitted a Text frame to the client within the timeout window. This may indicate an issue internally in Deepgram's systems or could be due to Deepgram not receiving enough audio data to transcribe a frame."
        elif e.reason == "NET-0001":
            error_description = "The service has not received a Binary frame from the client within the timeout window. This may indicate an internal issue in Deepgram's systems, the client's systems, or the network connecting them."
        print(f"🔴 {error_description}")
        print(
            "🔴 Please contact Deepgram Support (developers@deepgram.com) with the request ID listed above."
        )
        return
    except websockets.exceptions.ConnectionClosedOK:
        return
    except Exception as e:
        print(f"🔴 ERROR: Something went wrong! {e}")
        save_info()
        return

if __name__ == "__main__":
    sys.exit(main() or 0)
The Python script sends the microphone stream to Deepgram and prints out the JSON output. The Emacs Lisp code starts an asynchronous process, reads the JSON output, displays the transcript, and calculates the WPM based on the words. run.sh just activates the virtualenv for this project (requirements.txt is based on the streaming test suite) and then runs app.py, since some of the Python library versions conflict with other things I want to experiment with.
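The WPM code isn't shown above. Roughly, the idea is to keep the start times of recent words (Deepgram's word objects have start and end times in seconds) and divide the count by the length of the window. Here's a sketch along those lines; the 15-second window and the exact bookkeeping are made up for illustration, not taken from my actual code.

;; Sketch of the WPM calculation, not the exact code from this post.
;; It keeps the `start' timestamps of recent words and computes words
;; per minute over the last `my-live-speech-wpm-window' seconds.
(defvar my-live-speech-wpm-window 15
  "Seconds of speech to consider when calculating WPM (made up for this sketch).")

(defun my-live-speech-display-wpm (recognition-results)
  "Update `my-live-speech-wpm-string' using the words in RECOGNITION-RESULTS."
  (let ((words (alist-get 'words recognition-results)))
    (when (and words (> (length words) 0))
      ;; Remember when each word started.
      (setq my-live-speech-recent-words
            (append my-live-speech-recent-words
                    (mapcar (lambda (word) (alist-get 'start word)) words)))
      ;; Keep only the words inside the window, then divide by its length.
      (let ((cutoff (- (car (last my-live-speech-recent-words))
                       my-live-speech-wpm-window)))
        (setq my-live-speech-recent-words
              (seq-filter (lambda (start) (>= start cutoff))
                          my-live-speech-recent-words))
        (setq my-live-speech-wpm-string
              (format "%d WPM "
                      (/ (length my-live-speech-recent-words)
                         (/ my-live-speech-wpm-window 60.0))))))))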
I also added my-live-speech-wpm-string to my mode-line-format manually using Customize, since I wanted it displayed on the left side instead of getting lost when I turn keycast-mode on.
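Doing the same thing in Lisp instead of going through Customize would look something like this; note that re-evaluating it adds the entry again, which is part of why Customize is tidier for a one-off tweak.

;; Sketch: put the WPM string near the front of the default mode line
;; so it shows up on the left. `(:eval ...)' is a standard mode line
;; construct; the string is nil when captions are off, so nothing shows.
(setq-default mode-line-format
              (cons '(:eval my-live-speech-wpm-string)
                    (default-value 'mode-line-format)))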
I'm still a little anxious about accidentally leaving a process running, so I check with ps aux | grep python3. Eventually I'll figure out how to make sure everything gets properly stopped when I'm done.
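One likely approach, sketched here: since my-live-speech-stop already guards with process-live-p, it should be safe to run it from kill-emacs-hook so the process gets cleaned up whenever I exit Emacs.

;; Sketch: stop the speech process automatically when Emacs exits.
;; Emacs also normally asks about live processes on exit, which is
;; its own safety net.
(add-hook 'kill-emacs-hook #'my-live-speech-stop)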
Anyway, there it is!