Streaming Enrollment
- The following example shows how to stream audio using VoiceBio’s
StreamingEnroll request and generate a voiceprint. The stream can come from a file on disk or directly from a microphone in real time.
Streaming from an audio file
-
We support several headered file formats including WAV, MP3, FLAC etc. For more details, please see the protocol buffer specification here. For best accuracy, it is recommended to use an uncompressed or losslessly compressed audio format such as WAV or FLAC.
-
The examples below use a WAV file as input. We will query the server for available models and use the first model to generate the voiceprint.
-
Generated Voiceprints can be updated and made more robust by re-enrolling them with additional audio. Please see the re-enrollment section.
import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio
serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())
print("Models:")
for model in modelResp.models:
    print(model)

# Select a model ID from the list above. Going with the first model
# in this example. Stop with a clear message when the server reports no
# models, rather than failing with an IndexError on models[0].
if len(modelResp.models) == 0:
    raise SystemExit("no models available on the server")
modelID = modelResp.models[0].id

# Set the enrollment config. We don't set the audio format and let the
# server auto-detect the format from the file header.
cfg = voicebio.EnrollmentConfig(
    model_id=modelID,
    previous_voiceprint=None,
)
# StreamingEnroll expects the configuration in the very first request
# and nothing but audio bytes in every request after it. A small
# generator produces that sequence lazily.
def stream(cfg, audio, bufferSize=1024):
    """Yield the requests for one StreamingEnroll call.

    The first request carries ``cfg``. Each following request carries one
    chunk obtained from ``audio.read(bufferSize)``; the generator stops at
    the first empty chunk.
    """
    yield voicebio.StreamingEnrollRequest(config=cfg)

    while True:
        chunk = audio.read(bufferSize)
        if len(chunk) == 0:
            break
        yield voicebio.StreamingEnrollRequest(audio=voicebio.Audio(data=chunk))
# Streaming audio to the server.
with open("test.wav", "rb") as audio:
result = client.StreamingEnroll(stream(cfg, audio))
# A certain minimum duration of speech is required for completing enrollment.
# The enrollment status contains information on Whether that has been met or
# whether additional audio is required.
print(f"enrollment Status:\n{result.enrollment_status}\n")
# Saving the voiceprint data to a file. This can be provided again
# in another StreamingEnroll request (for continuing enrollment) or
# submitted for verification / identification requests.
with open("voiceprint.bin", 'w') as f:
f.write(result.voiceprint.data)package main
import (
"context"
"errors"
"fmt"
"io"
"os"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)
// main runs the file-based streaming enrollment example. The work happens in
// run so that its deferred cleanup executes before the process exits with a
// non-zero status.
func main() {
	if err := run(); err != nil {
		fmt.Printf("%v\n", err)
		os.Exit(1)
	}
}

// run connects to the VoiceBio server, enrolls the audio in test.wav using
// the first model the server lists, and writes the resulting voiceprint to
// voiceprint.bin.
//
// It returns an error instead of calling os.Exit so the deferred context
// cancel, connection close and file close always run.
func run() error {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		return fmt.Errorf("failed to dial gRPC connection: %w", err)
	}
	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		return fmt.Errorf("failed to get server version: %w", err)
	}

	fmt.Printf("%v\n", versionResp)

	// Get the list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		return fmt.Errorf("failed to get model list: %w", err)
	}

	fmt.Println("Models:")

	for _, m := range modelResp.Models {
		fmt.Println(m)
	}

	fmt.Println()

	// Indexing Models[0] below would panic on an empty list.
	if len(modelResp.Models) == 0 {
		return errors.New("no models available on the server")
	}

	// Selecting the first model.
	cfg := &voicebio.EnrollmentConfig{
		ModelId:            modelResp.Models[0].Id,
		PreviousVoiceprint: nil,
	}

	// Opening audio file.
	audio, err := os.Open("test.wav")
	if err != nil {
		return fmt.Errorf("failed to open audio file: %w", err)
	}
	defer audio.Close()

	// Starting enrollment.
	result, err := StreamingEnroll(ctx, client, cfg, audio)
	if err != nil {
		return fmt.Errorf("failed to run streaming enrollment: %w", err)
	}

	// A certain minimum duration of speech is required for completing enrollment.
	// The enrollment status contains information on whether that has been met or
	// whether additional audio is required.
	fmt.Printf("Enrollment Status: %v\n", result.EnrollmentStatus)

	// Saving the voiceprint data to a file. This can be provided again
	// in another StreamingEnroll request (for continuing enrollment) or
	// submitted for verification / identification requests.
	//
	// The file holds biometric data and is not a program, so it is written
	// owner read/write only instead of with os.ModePerm (0777).
	if err := os.WriteFile("voiceprint.bin", []byte(result.Voiceprint.Data), 0o600); err != nil {
		return fmt.Errorf("failed to write voiceprint data: %w", err)
	}

	return nil
}
// StreamingEnroll performs speaker enrollment (i.e. voiceprint generation)
// over VoiceBio's streaming API using the given cfg.
//
// It opens the enrollment stream, passes it to sendAudio together with cfg and
// audio, then closes the stream and returns the server's response. Audio is
// sent in chunks of at most streamingBufSize bytes.
//
// An io.EOF from sendAudio is not treated as a failure: it only signals that
// the stream has closed, and the real status comes from CloseAndRecv. Any
// other error from sendAudio is returned immediately with a nil response.
func StreamingEnroll(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.EnrollmentConfig,
	audio io.Reader,
) (*voicebio.StreamingEnrollResponse, error) {
	const streamingBufSize = 1024

	enrollStream, err := client.StreamingEnroll(ctx)
	if err != nil {
		return nil, err
	}

	switch sendErr := sendAudio(enrollStream, cfg, audio, streamingBufSize); {
	case sendErr == nil, errors.Is(sendErr, io.EOF):
		// Either everything was sent, or the stream closed early; in both
		// cases the outcome is reported by CloseAndRecv.
		return enrollStream.CloseAndRecv()
	default:
		return nil, sendErr
	}
}
// sendAudio sends the config and audio to a stream.
func sendAudio(
stream voicebio.VoiceBioService_StreamingEnrollClient,
cfg *voicebio.EnrollmentConfig,
audio io.Reader,
bufSize uint32,
) error {
// The first message needs to be a config message, and all subsequent
// messages must be audio messages.
// Send the config.
if err := stream.Send(&voicebio.StreamingEnrollRequest{
Request: &voicebio.StreamingEnrollRequest_Config{Config: cfg},
}); err != nil {
// if this failed, we don't need to CloseSend
return err
}
// Stream the audio.
buf := make([]byte, bufSize)
for {
n, err := audio.Read(buf)
if n > 0 {
if err2 := stream.Send(&voicebio.StreamingEnrollRequest{
Request: &voicebio.StreamingEnrollRequest_Audio{
Audio: &voicebio.Audio{Data: buf[:n]},
},
}); err2 != nil {
// if we couldn't Send, the stream has
// encountered an error and we don't need to
// CloseSend.
return err2
}
}
if err != nil {
// err could be io.EOF, or some other error reading from
// audio. In any case, we need to CloseSend, send the
// appropriate error to errCh and return from the function
if err2 := stream.CloseSend(); err2 != nil {
return err2
}
if err != io.EOF {
return err
}
return nil
}
}
}Streaming from microphone
-
Streaming audio from microphone input basically requires a reader interface that can provide audio samples recorded from a microphone; typically this requires interaction with system libraries. Another option is to use an external command line tool like
sox to record and pipe audio into the client. -
The examples below use the latter approach by using the
rec command provided with sox to record and stream the audio.
#!/usr/bin/env python3
# This example assumes sox is installed on the system and is available
# in the system's PATH variable. Instead of opening a regular file from
# disk, we open a subprocess that executes sox's rec command to record
# audio from the system's default microphone.
import subprocess
import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio
serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())
print("Models:")
for model in modelResp.models:
    print(model)

# Select a model ID from the list above. Going with the first model
# in this example. Stop with a clear message when the server reports no
# models, rather than failing with an IndexError on models[0].
if len(modelResp.models) == 0:
    raise SystemExit("no models available on the server")
m = modelResp.models[0]
modelID = m.id

# Setting audio format to be raw 16-bit signed little endian audio samples
# recorded at the sample rate expected by the model.
cfg = voicebio.EnrollmentConfig(
    model_id=modelID,
    previous_voiceprint=None,
    audio_format=voicebio.AudioFormat(
        audio_format_raw=voicebio.AudioFormatRAW(
            encoding="AUDIO_ENCODING_SIGNED",
            bit_depth=16,
            byte_order="BYTE_ORDER_LITTLE_ENDIAN",
            sample_rate=m.attributes.sample_rate,
            channels=1,
        )
    ),
)

# Open microphone stream using sox's rec command and record
# audio using the config specified above for *10 seconds*.
maxDuration = 10
cmd = f"rec -t raw -r {m.attributes.sample_rate} -e signed -b 16 -L -c 1 - trim 0 {maxDuration}"
mic = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
audio = mic.stdout

# Sanity check that the recorder is producing data. peek() inspects the
# buffered bytes without consuming them, so the start of the recording is
# still sent to the server (a plain read() here would discard it).
try:
    available = audio.peek(1)
except Exception as err:
    print(f"[ERROR] failed to read audio from mic stream: {err}")
    raise SystemExit(1)

if len(available) == 0:
    # An empty peek means the recorder closed its output without writing.
    print("[ERROR] failed to read audio from mic stream: no data received")
    raise SystemExit(1)

print(f"\n[INFO] recording {maxDuration} seconds of audio microphone ... \n")
# StreamingEnroll expects the enrollment configuration in the very first
# request and nothing but audio bytes in every request after it. A small
# generator produces that sequence lazily.
def stream(cfg, audio, bufferSize=1024):
    """Yield the requests for one StreamingEnroll call.

    The first request carries ``cfg``. Each following request carries one
    chunk obtained from ``audio.read(bufferSize)``; the generator stops at
    the first empty chunk.
    """
    yield voicebio.StreamingEnrollRequest(config=cfg)

    while True:
        chunk = audio.read(bufferSize)
        if len(chunk) == 0:
            break
        yield voicebio.StreamingEnrollRequest(audio=voicebio.Audio(data=chunk))
# Streaming audio to the server.
result = client.StreamingEnroll(stream(cfg, audio))
# A certain minimum duration of speech is required for completing enrollment.
# The enrollment status contains information on Whether that has been met or
# whether additional audio is required.
print(f"enrollment Status:\n{result.enrollment_status}\n")
# Saving the voiceprint data to a file. This can be provided again
# in another StreamingEnroll request (for continuing enrollment) or
# submitted for verification / identification requests.
with open("voiceprint.bin", 'w') as f:
f.write(result.voiceprint.data)
audio.close()
mic.kill()package main
import (
"context"
"errors"
"fmt"
"io"
"os"
"os/exec"
"strings"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)
// main runs the microphone streaming enrollment example. The work happens in
// run so that its deferred cleanup executes before the process exits with a
// non-zero status.
func main() {
	if err := run(); err != nil {
		fmt.Printf("%v\n", err)
		os.Exit(1)
	}
}

// run connects to the VoiceBio server, records audio from the microphone with
// sox's rec command, enrolls it using the first model the server lists, and
// writes the resulting voiceprint to voiceprint.bin.
//
// It returns an error instead of calling os.Exit so the deferred context
// cancel, connection close and pipe close always run.
func run() error {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		return fmt.Errorf("failed to dial gRPC connection: %w", err)
	}
	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		return fmt.Errorf("failed to get server version: %w", err)
	}

	fmt.Printf("%v\n", versionResp)

	// Get the list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		return fmt.Errorf("failed to get model list: %w", err)
	}

	fmt.Println("Models:")

	for _, m := range modelResp.Models {
		fmt.Println(m)
	}

	fmt.Println()

	// Indexing Models[0] below would panic on an empty list.
	if len(modelResp.Models) == 0 {
		return errors.New("no models available on the server")
	}

	// Selecting first model.
	m := modelResp.Models[0]

	// Setting audio format to be raw 16-bit signed little endian audio samples
	// recorded at the sample rate expected by the model.
	cfg := &voicebio.EnrollmentConfig{
		ModelId:            m.Id,
		PreviousVoiceprint: nil,
		AudioFormat: &voicebio.AudioFormat{
			AudioFormat: &voicebio.AudioFormat_AudioFormatRaw{
				AudioFormatRaw: &voicebio.AudioFormatRAW{
					Encoding:   voicebio.AudioEncoding_AUDIO_ENCODING_SIGNED,
					SampleRate: m.Attributes.SampleRate,
					BitDepth:   16,
					ByteOrder:  voicebio.ByteOrder_BYTE_ORDER_LITTLE_ENDIAN,
					Channels:   1,
				},
			},
		},
	}

	// Open microphone stream using sox's rec command and record
	// audio using the config specified above for *10 seconds*.
	maxDuration := 10
	args := fmt.Sprintf("-t raw -r %d -e signed -b 16 -L -c 1 - trim 0 %d", m.Attributes.SampleRate, maxDuration)

	cmd := exec.CommandContext(ctx, "rec", strings.Fields(args)...)
	cmd.Stderr = os.Stderr

	// The recorder writes into an in-memory pipe owned by this function.
	// os/exec documents that Run must not be combined with StdoutPipe,
	// because Wait closes that pipe as soon as the command exits and any
	// audio not yet read is lost. With an io.Writer as Stdout, Run only
	// returns after all output has been copied into the pipe.
	audio, micOut := io.Pipe()
	cmd.Stdout = micOut

	// Closing the read end on return unblocks the recorder's copy loop if
	// enrollment stops reading early.
	defer audio.Close()

	// Starting routines to record from microphone and stream to server
	// using an errgroup.Group whose context is cancelled if recording fails.
	eg, egCtx := errgroup.WithContext(ctx)

	eg.Go(func() error {
		fmt.Printf("\n[INFO] recording %d seconds from microphone \n", maxDuration)

		err := cmd.Run()
		if err != nil {
			err = fmt.Errorf("record from microphone: %w", err)
		}

		// Signal end of audio to the reader: a nil error is seen as
		// io.EOF, a non-nil error is returned from Read as-is.
		micOut.CloseWithError(err)

		return err
	})

	// Starting enrollment.
	result, err := StreamingEnroll(egCtx, client, cfg, audio)
	if err != nil {
		return fmt.Errorf("failed to run streaming enrollment: %w", err)
	}

	if err := eg.Wait(); err != nil {
		return err
	}

	// A certain minimum duration of speech is required for completing enrollment.
	// The enrollment status contains information on whether that has been met or
	// whether additional audio is required.
	fmt.Printf("Enrollment Status: %v\n", result.EnrollmentStatus)

	// Saving the voiceprint data to a file. This can be provided again
	// in another StreamingEnroll request (for continuing enrollment) or
	// submitted for verification / identification requests.
	//
	// The file holds biometric data and is not a program, so it is written
	// owner read/write only instead of with os.ModePerm (0777).
	if err := os.WriteFile("voiceprint.bin", []byte(result.Voiceprint.Data), 0o600); err != nil {
		return fmt.Errorf("failed to write voiceprint data: %w", err)
	}

	return nil
}
// StreamingEnroll performs speaker enrollment (i.e. voiceprint generation)
// over VoiceBio's streaming API using the given cfg.
//
// It opens the enrollment stream, passes it to sendAudio together with cfg and
// audio, then closes the stream and returns the server's response. Audio is
// sent in chunks of at most streamingBufSize bytes.
//
// An io.EOF from sendAudio is not treated as a failure: it only signals that
// the stream has closed, and the real status comes from CloseAndRecv. Any
// other error from sendAudio is returned immediately with a nil response.
func StreamingEnroll(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.EnrollmentConfig,
	audio io.Reader,
) (*voicebio.StreamingEnrollResponse, error) {
	const streamingBufSize = 1024

	enrollStream, err := client.StreamingEnroll(ctx)
	if err != nil {
		return nil, err
	}

	switch sendErr := sendAudio(enrollStream, cfg, audio, streamingBufSize); {
	case sendErr == nil, errors.Is(sendErr, io.EOF):
		// Either everything was sent, or the stream closed early; in both
		// cases the outcome is reported by CloseAndRecv.
		return enrollStream.CloseAndRecv()
	default:
		return nil, sendErr
	}
}
// sendAudio sends audio to a stream.
func sendAudio(
stream voicebio.VoiceBioService_StreamingEnrollClient,
cfg *voicebio.EnrollmentConfig,
audio io.Reader,
bufSize uint32,
) error {
// The first message needs to be a config message, and all subsequent
// messages must be audio messages.
// Send the config.
if err := stream.Send(&voicebio.StreamingEnrollRequest{
Request: &voicebio.StreamingEnrollRequest_Config{Config: cfg},
}); err != nil {
// if this failed, we don't need to CloseSend
return err
}
// Stream the audio.
buf := make([]byte, bufSize)
for {
n, err := audio.Read(buf)
if n > 0 {
if err2 := stream.Send(&voicebio.StreamingEnrollRequest{
Request: &voicebio.StreamingEnrollRequest_Audio{
Audio: &voicebio.Audio{Data: buf[:n]},
},
}); err2 != nil {
// if we couldn't Send, the stream has
// encountered an error and we don't need to
// CloseSend.
return err2
}
}
if err != nil {
// err could be io.EOF, or some other error reading from
// audio. In any case, we need to CloseSend, send the
// appropriate error to errCh and return from the function
if err2 := stream.CloseSend(); err2 != nil {
return err2
}
if err != io.EOF {
return err
}
return nil
}
}
}Re-enrollment
- Voiceprints can be updated and made more robust by re-enrolling them with
additional audio. This can be easily done by providing previous voiceprint
data in the
EnrollmentConfig along with additional audio in a new StreamingEnroll request.
# Connect to server ...
with open("voiceprint.bin", 'r') as f:
voiceprint = f.read().strip()
cfg = voicebio.EnrollmentConfig(
model_id=modelID,
previous_voiceprint=voicebio.Voiceprint(data=voiceprint),
)
# Send audio to server ...package main
import (
"context"
"errors"
"fmt"
"io"
"os"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)
// main sketches re-enrollment: it loads a previously saved voiceprint from
// voiceprint.bin and places it in the enrollment config. Connecting to the
// server and streaming the audio are elided; see the placeholders below.
func main() {
	// Connect to server ...

	// Load the voiceprint saved by an earlier enrollment.
	prevVoiceprint, err := os.ReadFile("voiceprint.bin")
	if err != nil {
		fmt.Printf("\nfailed to read voiceprint data: %v\n", err)
		os.Exit(1)
	}

	// Supplying the previous voiceprint makes this a re-enrollment.
	previous := &voicebio.Voiceprint{Data: string(prevVoiceprint)}
	cfg := &voicebio.EnrollmentConfig{
		ModelId:            modelResp.Models[0].Id,
		PreviousVoiceprint: previous,
	}

	// Send audio to server ...
}