-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathpi_zero_transcriber.py
123 lines (100 loc) · 4.03 KB
/
pi_zero_transcriber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# This is a simple transcriber that uses the LiveKit SDK to transcribe audio from the microphone.
# It displays the transcribed text on a Pirate Audio display on a Raspberry Pi Zero 2 W.
from pathlib import Path
from dotenv import load_dotenv
from livekit.agents import JobContext, WorkerOptions, cli
from livekit.agents.voice import Agent, AgentSession
from livekit.plugins import deepgram
from pathlib import Path
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
import st7789
import os
import signal
import time
import textwrap
# Load environment variables from the .env file located one directory above
# the directory that contains this script.
load_dotenv(dotenv_path=Path(__file__).parent.parent / '.env')

# Set up the screen
SPI_SPEED_MHZ = 20
screen = st7789.ST7789(
    rotation=90,  # Needed to display the right way up on Pirate Audio
    port=0,  # SPI port
    cs=1,  # SPI port Chip-select channel
    dc=9,  # BCM pin used for data/command
    backlight=13,
    spi_speed_hz=SPI_SPEED_MHZ * 1000 * 1000,  # The driver expects Hz, not MHz
)
width = screen.width
height = screen.height

# Create the off-screen image that every draw call renders into. It is sized
# from the panel's reported dimensions (rather than a hard-coded 240x240) so
# the buffer always matches the (0, 0, width, height) rectangle used to clear
# it before each redraw.
image = Image.new("RGB", (width, height), (0, 0, 0))
draw = ImageDraw.Draw(image)

# Set up fonts: regular weight for body text, bold for headings.
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 18)
title_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 22)
# Display startup screen
def show_startup_screen():
    """Render the start-up splash and push it to the display.

    Clears the shared image to black, writes the two-line "LiveKit" /
    "Transcription" heading in the title font followed by a "Starting..."
    status line in the body font, then sends the image to the screen.
    """
    black = (0, 0, 0)
    white = (255, 255, 255)
    light_grey = (200, 200, 200)

    draw.rectangle((0, 0, width, height), fill=black)

    # (top offset, label, typeface, colour) for each splash row, top to bottom.
    splash_rows = (
        (10, "LiveKit", title_font, white),
        (40, "Transcription", title_font, white),
        (80, "Starting...", font, light_grey),
    )
    for top, label, typeface, colour in splash_rows:
        draw.text((10, top), label, font=typeface, fill=colour)

    screen.display(image)
# Display transcription text
def display_transcription(text):
    """Redraw the display with a heading and the tail end of ``text``.

    The text is wrapped to 26 characters per row and only the last nine rows
    are drawn, so the most recent speech stays visible once the transcript
    no longer fits on the screen.

    Args:
        text: The transcript to show.
    """
    left_margin = 10
    first_row_top = 50
    row_height = 20  # Line spacing
    chars_per_row = 26  # Adjust as needed
    visible_rows = 9  # Approximate number of rows that fit

    # Start from a blank frame with the heading at the top.
    draw.rectangle((0, 0, width, height), fill=(0, 0, 0))
    draw.text((left_margin, 10), "Transcription", font=title_font, fill=(255, 255, 255))

    # A negative slice already returns the whole list when it is shorter than
    # the window, so no explicit length check is needed.
    rows = textwrap.wrap(text, width=chars_per_row)[-visible_rows:]
    for index, row in enumerate(rows):
        row_top = first_row_top + index * row_height
        draw.text((left_margin, row_top), row, font=font, fill=(200, 200, 200))

    screen.display(image)
async def entrypoint(ctx: JobContext):
    """Connect to the job's room and mirror transcribed user speech on the display.

    Shows the start-up screen, connects the job context, and starts an
    ``AgentSession`` whose agent uses Deepgram speech-to-text. Each
    ``user_input_transcribed`` event redraws the display. Final segments are
    also appended, one per line, to ``user_speech_log.txt`` in the current
    working directory.

    Args:
        ctx: Job context for this run; its ``room`` is handed to the session.
    """
    show_startup_screen()
    await ctx.connect()
    session = AgentSession()

    # All finalized segments so far, joined by single spaces.
    current_transcript = ""

    @session.on("user_input_transcribed")
    def on_transcript(transcript):
        """Fold one transcription event into the running text and redraw."""
        nonlocal current_transcript

        if transcript.is_final:
            # Final result: commit the segment to the running transcript.
            current_transcript = f"{current_transcript} {transcript.transcript}".strip()
            # Explicit UTF-8 so non-ASCII speech cannot fail under a
            # non-UTF-8 default locale.
            with open("user_speech_log.txt", "a", encoding="utf-8") as f:
                f.write(f"{transcript.transcript}\n")
            display_text = current_transcript
        else:
            # Interim result: show it after the committed text without
            # committing it. Joining only the non-empty parts avoids a leading
            # space (which textwrap keeps at the start of a paragraph) when
            # nothing has been finalized yet.
            display_text = " ".join(
                part for part in (current_transcript, transcript.transcript) if part
            )

        display_transcription(display_text)

    await session.start(
        agent=Agent(
            instructions="You are a helpful assistant that transcribes user speech to text.",
            stt=deepgram.STT(),
        ),
        room=ctx.room,
    )
if __name__ == "__main__":
    try:
        cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
    except KeyboardInterrupt:
        print("\nExiting transcriber")
    finally:
        # Blank the screen on every exit path, not only on KeyboardInterrupt.
        # NOTE(review): the CLI runner may handle Ctrl+C itself or leave via
        # SystemExit, in which case the handler above never runs and the last
        # transcript would otherwise stay on the panel - confirm against the
        # installed livekit-agents version.
        draw.rectangle((0, 0, width, height), fill=(0, 0, 0))
        screen.display(image)