Streaming Enrollment

Describes how to stream audio to the VoiceBio server for enrollment.

The following example shows how to stream audio using VoiceBio’s StreamingEnroll request and generate a voiceprint. The stream can come from a file on disk or directly from a microphone in real time.

Streaming from an audio file

We support several headered file formats, including WAV, MP3 and FLAC. For more details, please see the protocol buffer specification here. For best accuracy, we recommend using an uncompressed or losslessly compressed audio format such as WAV or FLAC.
The examples below use a WAV file as input. We will query the server for available models and use the first model to generate the voiceprint.
Generated Voiceprints can be updated and made more robust by re-enrolling them with additional audio. Please see the re-enrollment section.

Python
Go

import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio

serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())

print("Models:")
for model in modelResp.models:
    print(model)

# Enrollment needs a model. Stop with a clear message here instead of
# failing with an IndexError on the lookup below when the list is empty.
if len(modelResp.models) == 0:
    raise SystemExit("no models available on the server")

# Select a model ID from the list above. Going with the first model
# in this example.
modelID = modelResp.models[0].id

# Set the enrollment config. We don't set the audio format and let the
# server auto-detect the format from the file header.
cfg = voicebio.EnrollmentConfig(
    model_id=modelID,
    previous_voiceprint=None,
)

# StreamingEnroll takes the configuration on its own in the first request
# and audio bytes in every request after that. This generator produces the
# requests in that order.
def stream(cfg, audio, bufferSize=1024):
    """Yield the config request, then one audio request per chunk read.

    audio is read bufferSize bytes at a time; the generator finishes on the
    first read that returns no data.
    """
    yield voicebio.StreamingEnrollRequest(config=cfg)

    while True:
        chunk = audio.read(bufferSize)
        if len(chunk) == 0:
            break
        yield voicebio.StreamingEnrollRequest(audio=voicebio.Audio(data=chunk))

# Stream the contents of the audio file to the server.
with open("test.wav", "rb") as audio:
    result = client.StreamingEnroll(stream(cfg, audio))

# Enrollment only completes after a certain minimum duration of speech.
# The enrollment status reports whether that minimum has been met or
# whether additional audio is required.
status = result.enrollment_status
print(f"enrollment Status:\n{status}\n")

# Save the voiceprint data to a file. It can be passed to another
# StreamingEnroll request (to continue enrollment) or submitted with
# verification / identification requests.
with open("voiceprint.bin", "w") as f:
    f.write(result.voiceprint.data)

package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)

// main runs the file enrollment example and exits with status 1 if any step
// fails.
func main() {
	if err := run(); err != nil {
		fmt.Printf("%v\n", err)
		os.Exit(1)
	}
}

// run connects to the VoiceBio server at localhost:2727, prints the server
// version and the available models, enrolls the audio in test.wav with the
// first model, and writes the resulting voiceprint to voiceprint.bin.
//
// It returns the first error encountered. The work lives here rather than in
// main so that the deferred cleanups (context cancellation, closing the
// connection and the audio file) run on every path; calling os.Exit directly
// would skip them.
func run() error {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		return fmt.Errorf("failed to dial gRPC connection: %w", err)
	}

	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		return fmt.Errorf("failed to get server version: %w", err)
	}

	fmt.Printf("%v\n", versionResp)

	// Get the list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		return fmt.Errorf("failed to get model list: %w", err)
	}

	fmt.Println("Models:")
	for _, m := range modelResp.Models {
		fmt.Println(m)
	}
	fmt.Println()

	// Enrollment needs a model. Report a clear error instead of panicking
	// on the index below when the server has none.
	if len(modelResp.Models) == 0 {
		return errors.New("no models available on the server")
	}

	// Selecting the first model. The audio format is left unset.
	cfg := &voicebio.EnrollmentConfig{
		ModelId:            modelResp.Models[0].Id,
		PreviousVoiceprint: nil,
	}

	// Opening audio file.
	audio, err := os.Open("test.wav")
	if err != nil {
		return fmt.Errorf("failed to open audio file: %w", err)
	}

	defer audio.Close()

	// Starting enrollment.
	result, err := StreamingEnroll(ctx, client, cfg, audio)
	if err != nil {
		return fmt.Errorf("failed to run streaming enrollment: %w", err)
	}

	// A certain minimum duration of speech is required for completing enrollment.
	// The enrollment status contains information on whether that has been met or
	// whether additional audio is required.
	fmt.Printf("Enrollment Status: %v\n", result.EnrollmentStatus)

	// Saving the voiceprint data to a file. This can be provided again
	// in another StreamingEnroll request (for continuing enrollment) or
	// submitted for verification / identification requests.
	//
	// The file is plain data, so it gets no execute bits and is readable
	// and writable by its owner only.
	if err := os.WriteFile("voiceprint.bin", []byte(result.Voiceprint.Data), 0o600); err != nil {
		return fmt.Errorf("failed to write voiceprint data: %w", err)
	}

	return nil
}

// StreamingEnroll performs speaker enrollment (i.e. voiceprint generation)
// over the VoiceBio streaming API using the given cfg.
//
// It opens an enrollment stream on client and passes cfg and audio to
// sendAudio, which forwards the audio in chunks of up to 1024 bytes. The
// server's response is then collected with CloseAndRecv.
//
// An error opening the stream, or any error from sendAudio other than io.EOF,
// is returned immediately with a nil response.
func StreamingEnroll(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.EnrollmentConfig,
	audio io.Reader,
) (*voicebio.StreamingEnrollResponse, error) {
	// Largest number of audio bytes placed in a single request.
	const chunkSize = 1024

	stream, err := client.StreamingEnroll(ctx)
	if err != nil {
		return nil, err
	}

	sendErr := sendAudio(stream, cfg, audio, chunkSize)

	// io.EOF from sendAudio only signals that the stream has closed; the
	// actual status is reported by CloseAndRecv, so it is treated like
	// success here.
	if sendErr == nil || errors.Is(sendErr, io.EOF) {
		return stream.CloseAndRecv()
	}

	return nil, sendErr
}

// sendAudio writes one enrollment request sequence to stream: cfg first, then
// the contents of audio in chunks of at most bufSize bytes.
//
// Once audio.Read reports an error (io.EOF included), the send side of the
// stream is closed with CloseSend. The result is nil when audio ended with
// io.EOF and CloseSend succeeded; otherwise it is the error from stream.Send,
// stream.CloseSend or audio.Read that stopped the transfer.
func sendAudio(
	stream voicebio.VoiceBioService_StreamingEnrollClient,
	cfg *voicebio.EnrollmentConfig,
	audio io.Reader,
	bufSize uint32,
) error {
	// The config has to be the first message on the stream; every message
	// after it carries audio.
	cfgReq := &voicebio.StreamingEnrollRequest{
		Request: &voicebio.StreamingEnrollRequest_Config{Config: cfg},
	}
	if sendErr := stream.Send(cfgReq); sendErr != nil {
		// A failed Send needs no CloseSend.
		return sendErr
	}

	chunk := make([]byte, bufSize)

	for {
		n, readErr := audio.Read(chunk)

		// A Read may return data together with an error, so the bytes
		// are forwarded before readErr is examined.
		if n > 0 {
			audioReq := &voicebio.StreamingEnrollRequest{
				Request: &voicebio.StreamingEnrollRequest_Audio{
					Audio: &voicebio.Audio{Data: chunk[:n]},
				},
			}
			if sendErr := stream.Send(audioReq); sendErr != nil {
				// A failed Send needs no CloseSend.
				return sendErr
			}
		}

		if readErr == nil {
			continue
		}

		// Reading is over, either at io.EOF or because of a real
		// failure. Close the send side in both cases.
		if closeErr := stream.CloseSend(); closeErr != nil {
			return closeErr
		}

		if readErr != io.EOF {
			return readErr
		}

		return nil
	}
}

Streaming from microphone

Streaming audio from microphone input requires a reader interface that can provide audio samples recorded from a microphone; typically this requires interaction with system libraries. Another option is to use an external command-line tool like sox to record and pipe audio into the client.
The examples below use the latter approach by using the rec command provided with sox to record and stream the audio.

Python
Go

#!/usr/bin/env python3

# This example assumes sox is installed on the system and is available
# in the system's PATH variable. Instead of opening a regular file from
# disk, we open a subprocess that executes sox's rec command to record
# audio from the system's default microphone.

import subprocess
import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio

serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())

print("Models:")
for model in modelResp.models:
    print(model)

# Enrollment needs a model. Stop with a clear message here instead of
# failing with an IndexError on the lookup below when the list is empty.
if len(modelResp.models) == 0:
    raise SystemExit("no models available on the server")

# Select a model ID from the list above. Going with the first model
# in this example.
m = modelResp.models[0]
modelID = m.id

# Setting audio format to be raw 16-bit signed little endian audio samples
# recorded at the sample rate expected by the model.
cfg = voicebio.EnrollmentConfig(
    model_id=modelID,
    previous_voiceprint=None,
    audio_format=voicebio.AudioFormat(
        audio_format_raw=voicebio.AudioFormatRAW(
            encoding="AUDIO_ENCODING_SIGNED",
            bit_depth=16,
            byte_order="BYTE_ORDER_LITTLE_ENDIAN",
            sample_rate=m.attributes.sample_rate,
            channels=1,
        )
    ),
)

# Open microphone stream using sox's rec command and record
# audio using the config specified above for *10 seconds*.
maxDuration = 10
cmd = f"rec -t raw -r {m.attributes.sample_rate} -e signed -b 16 -L -c 1 - trim 0 {maxDuration}"
mic = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
audio = mic.stdout

# Sanity check: wait until the recorder has produced some bytes. peek()
# leaves them in the buffer, so no recorded audio is dropped before
# streaming starts. An empty result means rec closed its output without
# writing anything, in which case there is nothing to enroll.
try:
    if len(audio.peek(1)) == 0:
        raise RuntimeError("rec exited without producing audio")
except Exception as err:
    print(f"[ERROR] failed to read audio from mic stream: {err}")
    mic.kill()
    raise SystemExit(1)

print(f"\n[INFO] recording {maxDuration} seconds of audio microphone ... \n")

# StreamingEnroll takes the enrollment configuration on its own in the
# first request and audio bytes in every request after that. This
# generator produces the requests in that order.
def stream(cfg, audio, bufferSize=1024):
    """Yield the config request, then one audio request per chunk read.

    audio is read bufferSize bytes at a time; the generator finishes on the
    first read that returns no data.
    """
    yield voicebio.StreamingEnrollRequest(config=cfg)

    while True:
        chunk = audio.read(bufferSize)
        if len(chunk) == 0:
            break
        yield voicebio.StreamingEnrollRequest(audio=voicebio.Audio(data=chunk))

# Stream the recorded audio to the server.
result = client.StreamingEnroll(stream(cfg, audio))

# Enrollment only completes after a certain minimum duration of speech.
# The enrollment status reports whether that minimum has been met or
# whether additional audio is required.
status = result.enrollment_status
print(f"enrollment Status:\n{status}\n")

# Save the voiceprint data to a file. It can be passed to another
# StreamingEnroll request (to continue enrollment) or submitted with
# verification / identification requests.
with open("voiceprint.bin", "w") as f:
    f.write(result.voiceprint.data)

# Release the recorder: close our end of its output, then stop the process.
audio.close()
mic.kill()

package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"strings"

	"golang.org/x/sync/errgroup"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)

// main runs the microphone enrollment example and exits with status 1 if any
// step fails.
func main() {
	if err := run(); err != nil {
		fmt.Printf("%v\n", err)
		os.Exit(1)
	}
}

// run connects to the VoiceBio server at localhost:2727, prints the server
// version and the available models, records 10 seconds of microphone audio
// with sox's rec command, enrolls it with the first model, and writes the
// resulting voiceprint to voiceprint.bin.
//
// It returns the first error encountered. The work lives here rather than in
// main so that the deferred cleanups (context cancellation, closing the
// connection) run on every path; calling os.Exit directly would skip them.
func run() error {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		return fmt.Errorf("failed to dial gRPC connection: %w", err)
	}

	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		return fmt.Errorf("failed to get server version: %w", err)
	}

	fmt.Printf("%v\n", versionResp)

	// Get the list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		return fmt.Errorf("failed to get model list: %w", err)
	}

	fmt.Println("Models:")
	for _, m := range modelResp.Models {
		fmt.Println(m)
	}
	fmt.Println()

	// Enrollment needs a model. Report a clear error instead of panicking
	// on the index below when the server has none.
	if len(modelResp.Models) == 0 {
		return errors.New("no models available on the server")
	}

	// Selecting first model.
	m := modelResp.Models[0]

	// Setting audio format to be raw 16-bit signed little endian audio samples
	// recorded at the sample rate expected by the model.
	cfg := &voicebio.EnrollmentConfig{
		ModelId:            m.Id,
		PreviousVoiceprint: nil,
		AudioFormat: &voicebio.AudioFormat{
			AudioFormat: &voicebio.AudioFormat_AudioFormatRaw{
				AudioFormatRaw: &voicebio.AudioFormatRAW{
					Encoding:   voicebio.AudioEncoding_AUDIO_ENCODING_SIGNED,
					SampleRate: m.Attributes.SampleRate,
					BitDepth:   16,
					ByteOrder:  voicebio.ByteOrder_BYTE_ORDER_LITTLE_ENDIAN,
					Channels:   1,
				},
			},
		},
	}

	// Open microphone stream using sox's rec command and record
	// audio using the config specified above for *10 seconds*.
	maxDuration := 10
	args := fmt.Sprintf("-t raw -r %d -e signed -b 16 -L -c 1 - trim 0 %d", m.Attributes.SampleRate, maxDuration)
	cmd := exec.CommandContext(ctx, "rec", strings.Fields(args)...)
	cmd.Stderr = os.Stderr

	// The recorder writes into an in-memory pipe. cmd.StdoutPipe is not
	// used because it must not be combined with cmd.Run: Run waits for the
	// command and then closes that pipe, which can happen before the
	// enrollment side has read all of the audio.
	audio, audioWriter := io.Pipe()
	cmd.Stdout = audioWriter

	// Starting routines to record from microphone and stream to server
	// using an errgroup.Group that returns if either one encounters an error.
	eg, ctx := errgroup.WithContext(ctx)

	eg.Go(func() error {
		fmt.Printf("\n[INFO] recording %d seconds from microphone \n", maxDuration)

		err := cmd.Run()

		// Closing the writer ends the audio stream: the reader sees
		// io.EOF when err is nil and err itself otherwise.
		// CloseWithError always returns nil.
		_ = audioWriter.CloseWithError(err)

		if err != nil {
			return fmt.Errorf("record from microphone: %w", err)
		}

		return nil
	})

	// Starting enrollment.
	result, err := StreamingEnroll(ctx, client, cfg, audio)
	if err != nil {
		// Stop accepting audio so the recorder is not left blocked on
		// a write that nobody will read. CloseWithError always returns
		// nil.
		_ = audio.CloseWithError(err)

		return fmt.Errorf("failed to run streaming enrollment: %w", err)
	}

	if err := eg.Wait(); err != nil {
		return err
	}

	// A certain minimum duration of speech is required for completing enrollment.
	// The enrollment status contains information on whether that has been met or
	// whether additional audio is required.
	fmt.Printf("Enrollment Status: %v\n", result.EnrollmentStatus)

	// Saving the voiceprint data to a file. This can be provided again
	// in another StreamingEnroll request (for continuing enrollment) or
	// submitted for verification / identification requests.
	//
	// The file is plain data, so it gets no execute bits and is readable
	// and writable by its owner only.
	if err := os.WriteFile("voiceprint.bin", []byte(result.Voiceprint.Data), 0o600); err != nil {
		return fmt.Errorf("failed to write voiceprint data: %w", err)
	}

	return nil
}

// StreamingEnroll performs speaker enrollment (i.e. voiceprint generation)
// over the VoiceBio streaming API using the given cfg.
//
// It opens an enrollment stream on client and passes cfg and audio to
// sendAudio, which forwards the audio in chunks of up to 1024 bytes. The
// server's response is then collected with CloseAndRecv.
//
// An error opening the stream, or any error from sendAudio other than io.EOF,
// is returned immediately with a nil response.
func StreamingEnroll(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.EnrollmentConfig,
	audio io.Reader,
) (*voicebio.StreamingEnrollResponse, error) {
	// Largest number of audio bytes placed in a single request.
	const chunkSize = 1024

	stream, err := client.StreamingEnroll(ctx)
	if err != nil {
		return nil, err
	}

	sendErr := sendAudio(stream, cfg, audio, chunkSize)

	// io.EOF from sendAudio only signals that the stream has closed; the
	// actual status is reported by CloseAndRecv, so it is treated like
	// success here.
	if sendErr == nil || errors.Is(sendErr, io.EOF) {
		return stream.CloseAndRecv()
	}

	return nil, sendErr
}

// sendAudio writes one enrollment request sequence to stream: cfg first, then
// the contents of audio in chunks of at most bufSize bytes.
//
// Once audio.Read reports an error (io.EOF included), the send side of the
// stream is closed with CloseSend. The result is nil when audio ended with
// io.EOF and CloseSend succeeded; otherwise it is the error from stream.Send,
// stream.CloseSend or audio.Read that stopped the transfer.
func sendAudio(
	stream voicebio.VoiceBioService_StreamingEnrollClient,
	cfg *voicebio.EnrollmentConfig,
	audio io.Reader,
	bufSize uint32,
) error {
	// The config has to be the first message on the stream; every message
	// after it carries audio.
	cfgReq := &voicebio.StreamingEnrollRequest{
		Request: &voicebio.StreamingEnrollRequest_Config{Config: cfg},
	}
	if sendErr := stream.Send(cfgReq); sendErr != nil {
		// A failed Send needs no CloseSend.
		return sendErr
	}

	chunk := make([]byte, bufSize)

	for {
		n, readErr := audio.Read(chunk)

		// A Read may return data together with an error, so the bytes
		// are forwarded before readErr is examined.
		if n > 0 {
			audioReq := &voicebio.StreamingEnrollRequest{
				Request: &voicebio.StreamingEnrollRequest_Audio{
					Audio: &voicebio.Audio{Data: chunk[:n]},
				},
			}
			if sendErr := stream.Send(audioReq); sendErr != nil {
				// A failed Send needs no CloseSend.
				return sendErr
			}
		}

		if readErr == nil {
			continue
		}

		// Reading is over, either at io.EOF or because of a real
		// failure. Close the send side in both cases.
		if closeErr := stream.CloseSend(); closeErr != nil {
			return closeErr
		}

		if readErr != io.EOF {
			return readErr
		}

		return nil
	}
}

Re-enrollment

Voiceprints can be updated and made more robust by re-enrolling them with additional audio. This can be easily done by providing previous voiceprint data in the EnrollmentConfig along with additional audio in a new StreamingEnroll request.

Python
Go

# Connect to server ...

# Load the voiceprint data saved by an earlier enrollment, with
# surrounding whitespace removed.
with open("voiceprint.bin", "r") as f:
    voiceprint = f.read().strip()

# Passing the earlier voiceprint in the config makes the new request
# update it instead of starting from scratch.
cfg = voicebio.EnrollmentConfig(
    model_id=modelID,
    previous_voiceprint=voicebio.Voiceprint(data=voiceprint),
)

# Send audio to server ...

package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)

// main outlines re-enrollment: it loads a previously saved voiceprint from
// voiceprint.bin and places it in the EnrollmentConfig for a new request.
// Connecting to the server and streaming the audio are elided here.
func main() {

	// Connect to server ...

	// Load the voiceprint data saved by an earlier enrollment.
	raw, err := os.ReadFile("voiceprint.bin")
	if err != nil {
		fmt.Printf("\nfailed to read voiceprint data: %v\n", err)
		os.Exit(1)
	}

	// Passing the earlier voiceprint in the config makes the new request
	// update it instead of starting from scratch.
	previous := &voicebio.Voiceprint{Data: string(raw)}

	cfg := &voicebio.EnrollmentConfig{
		ModelId:            modelResp.Models[0].Id,
		PreviousVoiceprint: previous,
	}

	// Send audio to server ...
}