Streaming Enrollment

Describes how to stream audio to the VoiceBio server for enrollment.
  • The following example shows how to stream audio using VoiceBio’s StreamingEnroll request to generate a voiceprint. The audio can come from a file on disk or directly from a microphone in real time.

Streaming from an audio file

  • We support several headered file formats, including WAV, MP3 and FLAC. For more details, please see the protocol buffer specification here. For best accuracy, we recommend an uncompressed or losslessly compressed audio format such as WAV or FLAC.

  • The examples below use a WAV file as input. We will query the server for available models and use the first model to generate the voiceprint.

  • Generated voiceprints can be updated and made more robust by re-enrolling them with additional audio. Please see the Re-enrollment section below.

import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio

serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())

print("Models:")
for model in modelResp.models:
    print(model)

# Enrollment needs a model; stop with a clear message rather than an
# IndexError if the server did not list any.
if len(modelResp.models) == 0:
    raise SystemExit("[ERROR] no models available on the server")

# Select a model ID from the list above. Going with the first model
# in this example.
modelID = modelResp.models[0].id

# Set the enrollment config. We don't set the audio format and let the
# server auto-detect the format from the file header. There is no
# previous voiceprint because this is a fresh enrollment.
cfg = voicebio.EnrollmentConfig(
    model_id=modelID,
    previous_voiceprint=None,
)

# Request generator for StreamingEnroll: the configuration goes out
# first, on its own, and every request after it carries audio bytes.
def stream(cfg, audio, bufferSize=1024):
    """Yield the requests that make up one StreamingEnroll call.

    cfg is sent alone in the first request. After that, audio is read in
    chunks of at most bufferSize bytes and each chunk is sent in its own
    request. Iteration ends at the first empty read.
    """
    yield voicebio.StreamingEnrollRequest(config=cfg)

    while True:
        chunk = audio.read(bufferSize)
        if len(chunk) == 0:
            return
        yield voicebio.StreamingEnrollRequest(audio=voicebio.Audio(data=chunk))

# Stream the contents of test.wav to the server and keep the response.
with open("test.wav", "rb") as wav:
    result = client.StreamingEnroll(stream(cfg, wav))

# Enrollment only completes once a minimum duration of speech has been
# received. The enrollment status reports whether that has been met or
# whether additional audio is required.
print(f"enrollment Status:\n{result.enrollment_status}\n")

# Keep the voiceprint on disk. It can be passed to a later
# StreamingEnroll request (to continue enrollment) or submitted with
# verification / identification requests.
with open("voiceprint.bin", "w") as out:
    out.write(result.voiceprint.data)
package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)

// main runs the file-based enrollment example. Any failure is printed and the
// process exits with status 1.
func main() {
	if err := run(); err != nil {
		fmt.Printf("%v\n", err)
		os.Exit(1)
	}
}

// run connects to the VoiceBio server, enrolls the speaker recorded in
// test.wav using the first model the server lists, and writes the resulting
// voiceprint to voiceprint.bin.
//
// It returns the first error encountered. The work lives here rather than in
// main so that the deferred cleanups (context cancel, connection close, file
// close) run before the process exits; os.Exit would skip them.
func run() error {
	const (
		serverAddress = "localhost:2727"

		// A voiceprint is biometric data, so the file is created readable
		// and writable by the current user only.
		voiceprintPerm = 0o600
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		return fmt.Errorf("failed to dial gRPC connection: %w", err)
	}
	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		return fmt.Errorf("failed to get server version: %w", err)
	}

	fmt.Printf("%v\n", versionResp)

	// Get the list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		return fmt.Errorf("failed to get model list: %w", err)
	}

	fmt.Println("Models:")
	for _, m := range modelResp.Models {
		fmt.Println(m)
	}
	fmt.Println()

	// Enrollment needs a model; report an error instead of panicking on an
	// empty list.
	if len(modelResp.Models) == 0 {
		return errors.New("no models available on the server")
	}

	// Selecting the first model. No audio format is set, so the server is
	// left to detect it from the file header, and there is no previous
	// voiceprint because this is a fresh enrollment.
	cfg := &voicebio.EnrollmentConfig{
		ModelId:            modelResp.Models[0].Id,
		PreviousVoiceprint: nil,
	}

	// Opening audio file.
	audio, err := os.Open("test.wav")
	if err != nil {
		return fmt.Errorf("failed to open audio file: %w", err)
	}
	defer audio.Close()

	// Starting enrollment.
	result, err := StreamingEnroll(ctx, client, cfg, audio)
	if err != nil {
		return fmt.Errorf("failed to run streaming enrollment: %w", err)
	}

	// A certain minimum duration of speech is required for completing enrollment.
	// The enrollment status contains information on whether that has been met or
	// whether additional audio is required.
	fmt.Printf("Enrollment Status: %v\n", result.EnrollmentStatus)

	// Saving the voiceprint data to a file. This can be provided again
	// in another StreamingEnroll request (for continuing enrollment) or
	// submitted for verification / identification requests.
	if err := os.WriteFile("voiceprint.bin", []byte(result.Voiceprint.Data), voiceprintPerm); err != nil {
		return fmt.Errorf("failed to write voiceprint data: %w", err)
	}

	return nil
}

// StreamingEnroll performs speaker enrollment (i.e. voiceprint generation)
// over the client-streaming API, configured by cfg.
//
// A stream is opened on client, cfg and then the contents of audio are sent
// over it in chunks of at most 1024 bytes, and the server's response is
// returned.
//
// A failure to open the stream, to read the audio or to send it is returned
// immediately. The one exception is io.EOF from the sending step: that only
// signals that the stream has closed, and the actual outcome is whatever
// CloseAndRecv reports.
func StreamingEnroll(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.EnrollmentConfig,
	audio io.Reader,
) (*voicebio.StreamingEnrollResponse, error) {
	const chunkSize = 1024

	// Open the stream.
	call, err := client.StreamingEnroll(ctx)
	if err != nil {
		return nil, err
	}

	// Push the config and audio. io.EOF is not a failure here; the real
	// status comes from CloseAndRecv below.
	sendErr := sendAudio(call, cfg, audio, chunkSize)
	if sendErr != nil && !errors.Is(sendErr, io.EOF) {
		return nil, sendErr
	}

	// Collect the server's response.
	return call.CloseAndRecv()
}

// sendAudio writes one complete enrollment request sequence to stream: first
// a message carrying cfg, then the contents of audio in chunks of at most
// bufSize bytes.
//
// A failed Send is returned as-is, without closing the send side. Once audio
// stops yielding data (io.EOF or any other read error) the send side is closed
// with CloseSend. A CloseSend failure takes precedence; otherwise io.EOF is
// reported as nil and any other read error is returned unchanged.
func sendAudio(
	stream voicebio.VoiceBioService_StreamingEnrollClient,
	cfg *voicebio.EnrollmentConfig,
	audio io.Reader,
	bufSize uint32,
) error {
	// The config must be the first message; everything after it is audio.
	configMsg := &voicebio.StreamingEnrollRequest{
		Request: &voicebio.StreamingEnrollRequest_Config{Config: cfg},
	}
	if sendErr := stream.Send(configMsg); sendErr != nil {
		// The stream is already broken, so there is nothing to CloseSend.
		return sendErr
	}

	chunk := make([]byte, bufSize)
	for {
		n, readErr := audio.Read(chunk)

		// Forward whatever was read before looking at readErr: a read may
		// return data together with an error.
		if n > 0 {
			audioMsg := &voicebio.StreamingEnrollRequest{
				Request: &voicebio.StreamingEnrollRequest_Audio{
					Audio: &voicebio.Audio{Data: chunk[:n]},
				},
			}
			if sendErr := stream.Send(audioMsg); sendErr != nil {
				// As above: a failed Send needs no CloseSend.
				return sendErr
			}
		}

		if readErr == nil {
			continue
		}

		// The reader is finished (io.EOF) or failed. Either way the send
		// side is closed before reporting the outcome.
		if closeErr := stream.CloseSend(); closeErr != nil {
			return closeErr
		}

		if readErr == io.EOF {
			return nil
		}

		return readErr
	}
}

Streaming from microphone

  • Streaming audio from a microphone requires a reader interface that can provide audio samples as they are recorded; typically this requires interaction with system libraries. Another option is to use an external command-line tool like sox to record and pipe audio into the client.

  • The examples below use the latter approach by using the rec command provided with sox to record and stream the audio.

#!/usr/bin/env python3

# This example assumes sox is installed on the system and is available
# in the system's PATH variable. Instead of opening a regular file from
# disk, we open a subprocess that executes sox's rec command to record
# audio from the system's default microphone.

import subprocess
import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio

serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())

print("Models:")
for model in modelResp.models:
    print(model)

# Enrollment needs a model; stop with a clear message rather than an
# IndexError if the server did not list any.
if len(modelResp.models) == 0:
    raise SystemExit("[ERROR] no models available on the server")

# Select a model ID from the list above. Going with the first model
# in this example.
m = modelResp.models[0]
modelID = m.id

# Setting audio format to be raw 16-bit signed little endian audio samples
# recorded at the sample rate expected by the model. Raw audio has no
# header, so the format has to be stated explicitly.
cfg = voicebio.EnrollmentConfig(
    model_id=modelID,
    previous_voiceprint=None,
    audio_format=voicebio.AudioFormat(
        audio_format_raw=voicebio.AudioFormatRAW(
            encoding="AUDIO_ENCODING_SIGNED",
            bit_depth=16,
            byte_order="BYTE_ORDER_LITTLE_ENDIAN",
            sample_rate=m.attributes.sample_rate,
            channels=1,
        )
    ),
)

# Open microphone stream using sox's rec command and record
# audio using the config specified above for *10 seconds*.
maxDuration = 10
cmd = f"rec -t raw -r {m.attributes.sample_rate} -e signed -b 16 -L -c 1 - trim 0 {maxDuration}"
mic = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
audio = mic.stdout

# Sanity check: wait until the recorder has produced its first bytes.
# peek() leaves them in the buffer, so no audio is dropped before
# streaming starts. An empty result means rec exited without writing
# anything.
try:
    if len(audio.peek(1)) == 0:
        raise EOFError("rec exited without producing any audio")
except Exception as err:
    print(f"[ERROR] failed to read audio from mic stream: {err}")
    # Nothing can be enrolled without audio: stop the recorder and exit
    # instead of streaming from a broken source.
    mic.kill()
    mic.wait()
    raise SystemExit(1)

print(f"\n[INFO] recording {maxDuration} seconds of audio microphone ... \n")

# Request generator for StreamingEnroll: the enrollment configuration
# goes out first, on its own, and every request after it carries audio
# bytes.
def stream(cfg, audio, bufferSize=1024):
    """Yield the requests that make up one StreamingEnroll call.

    cfg is sent alone in the first request. After that, audio is read in
    chunks of at most bufferSize bytes and each chunk is sent in its own
    request. Iteration ends at the first empty read.
    """
    yield voicebio.StreamingEnrollRequest(config=cfg)

    while True:
        chunk = audio.read(bufferSize)
        if len(chunk) == 0:
            return
        yield voicebio.StreamingEnrollRequest(audio=voicebio.Audio(data=chunk))

# Streaming audio to the server. The recorder is shut down in the
# finally clause so that it does not keep running if the request fails.
try:
    result = client.StreamingEnroll(stream(cfg, audio))
finally:
    audio.close()
    mic.kill()
    # Reap the recorder process after killing it.
    mic.wait()

# A certain minimum duration of speech is required for completing enrollment.
# The enrollment status contains information on whether that has been met or
# whether additional audio is required.
print(f"enrollment Status:\n{result.enrollment_status}\n")

# Saving the voiceprint data to a file. This can be provided again
# in another StreamingEnroll request (for continuing enrollment) or
# submitted for verification / identification requests.
with open("voiceprint.bin", 'w') as f:
    f.write(result.voiceprint.data)
package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"strings"

	"golang.org/x/sync/errgroup"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)

// main runs the microphone enrollment example. Any failure is printed and the
// process exits with status 1.
func main() {
	if err := run(); err != nil {
		fmt.Printf("%v\n", err)
		os.Exit(1)
	}
}

// run connects to the VoiceBio server, records 10 seconds of audio from the
// microphone with sox's rec command, streams it for enrollment using the
// first model the server lists, and writes the resulting voiceprint to
// voiceprint.bin.
//
// It returns the first error encountered. The work lives here rather than in
// main so that the deferred cleanups (context cancel, which also stops rec,
// and connection close) run before the process exits; os.Exit would skip
// them.
func run() error {
	const (
		serverAddress = "localhost:2727"

		// A voiceprint is biometric data, so the file is created readable
		// and writable by the current user only.
		voiceprintPerm = 0o600
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		return fmt.Errorf("failed to dial gRPC connection: %w", err)
	}
	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		return fmt.Errorf("failed to get server version: %w", err)
	}

	fmt.Printf("%v\n", versionResp)

	// Get the list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		return fmt.Errorf("failed to get model list: %w", err)
	}

	fmt.Println("Models:")
	for _, m := range modelResp.Models {
		fmt.Println(m)
	}
	fmt.Println()

	// Enrollment needs a model; report an error instead of panicking on an
	// empty list.
	if len(modelResp.Models) == 0 {
		return errors.New("no models available on the server")
	}

	// Selecting first model.
	m := modelResp.Models[0]

	// Setting audio format to be raw 16-bit signed little endian audio samples
	// recorded at the sample rate expected by the model.
	cfg := &voicebio.EnrollmentConfig{
		ModelId:            m.Id,
		PreviousVoiceprint: nil,
		AudioFormat: &voicebio.AudioFormat{
			AudioFormat: &voicebio.AudioFormat_AudioFormatRaw{
				AudioFormatRaw: &voicebio.AudioFormatRAW{
					Encoding:   voicebio.AudioEncoding_AUDIO_ENCODING_SIGNED,
					SampleRate: m.Attributes.SampleRate,
					BitDepth:   16,
					ByteOrder:  voicebio.ByteOrder_BYTE_ORDER_LITTLE_ENDIAN,
					Channels:   1,
				},
			},
		},
	}

	// Open microphone stream using sox's rec command and record
	// audio using the config specified above for *10 seconds*.
	maxDuration := 10
	args := fmt.Sprintf("-t raw -r %d -e signed -b 16 -L -c 1 - trim 0 %d", m.Attributes.SampleRate, maxDuration)
	cmd := exec.CommandContext(ctx, "rec", strings.Fields(args)...)
	cmd.Stderr = os.Stderr

	// rec writes into an in-memory pipe that the enrollment stream reads
	// from. cmd.StdoutPipe is deliberately not used: os/exec documents that
	// calling Run together with StdoutPipe is incorrect, because Wait closes
	// the pipe as soon as the command exits, possibly before the reader has
	// consumed everything. With a plain io.Writer as Stdout, Run only returns
	// once all of rec's output has been copied into the pipe.
	audio, recOut := io.Pipe()
	defer audio.Close() // unblocks the recorder side if we return early
	cmd.Stdout = recOut

	// Starting routines to record from microphone and stream to server
	// using an errgroup.Group whose context is cancelled if recording fails.
	eg, egCtx := errgroup.WithContext(ctx)

	eg.Go(func() error {
		// Closing the write end makes the reader see io.EOF once rec has
		// finished, which ends the enrollment stream.
		defer recOut.Close()

		fmt.Printf("\n[INFO] recording %d seconds from microphone \n", maxDuration)

		if err := cmd.Run(); err != nil {
			return fmt.Errorf("record from microphone: %w", err)
		}

		return nil
	})

	// Starting enrollment.
	result, err := StreamingEnroll(egCtx, client, cfg, audio)
	if err != nil {
		return fmt.Errorf("failed to run streaming enrollment: %w", err)
	}

	if err := eg.Wait(); err != nil {
		return err
	}

	// A certain minimum duration of speech is required for completing enrollment.
	// The enrollment status contains information on whether that has been met or
	// whether additional audio is required.
	fmt.Printf("Enrollment Status: %v\n", result.EnrollmentStatus)

	// Saving the voiceprint data to a file. This can be provided again
	// in another StreamingEnroll request (for continuing enrollment) or
	// submitted for verification / identification requests.
	if err := os.WriteFile("voiceprint.bin", []byte(result.Voiceprint.Data), voiceprintPerm); err != nil {
		return fmt.Errorf("failed to write voiceprint data: %w", err)
	}

	return nil
}

// StreamingEnroll performs speaker enrollment (i.e. voiceprint generation)
// over the client-streaming API, configured by cfg.
//
// A stream is opened on client, cfg and then the contents of audio are sent
// over it in chunks of at most 1024 bytes, and the server's response is
// returned.
//
// A failure to open the stream, to read the audio or to send it is returned
// immediately. The one exception is io.EOF from the sending step: that only
// signals that the stream has closed, and the actual outcome is whatever
// CloseAndRecv reports.
func StreamingEnroll(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.EnrollmentConfig,
	audio io.Reader,
) (*voicebio.StreamingEnrollResponse, error) {
	const chunkSize = 1024

	// Open the stream.
	call, err := client.StreamingEnroll(ctx)
	if err != nil {
		return nil, err
	}

	// Push the config and audio. io.EOF is not a failure here; the real
	// status comes from CloseAndRecv below.
	sendErr := sendAudio(call, cfg, audio, chunkSize)
	if sendErr != nil && !errors.Is(sendErr, io.EOF) {
		return nil, sendErr
	}

	// Collect the server's response.
	return call.CloseAndRecv()
}

// sendAudio writes one complete enrollment request sequence to stream: first
// a message carrying cfg, then the contents of audio in chunks of at most
// bufSize bytes.
//
// A failed Send is returned as-is, without closing the send side. Once audio
// stops yielding data (io.EOF or any other read error) the send side is closed
// with CloseSend. A CloseSend failure takes precedence; otherwise io.EOF is
// reported as nil and any other read error is returned unchanged.
func sendAudio(
	stream voicebio.VoiceBioService_StreamingEnrollClient,
	cfg *voicebio.EnrollmentConfig,
	audio io.Reader,
	bufSize uint32,
) error {
	// The config must be the first message; everything after it is audio.
	configMsg := &voicebio.StreamingEnrollRequest{
		Request: &voicebio.StreamingEnrollRequest_Config{Config: cfg},
	}
	if sendErr := stream.Send(configMsg); sendErr != nil {
		// The stream is already broken, so there is nothing to CloseSend.
		return sendErr
	}

	chunk := make([]byte, bufSize)
	for {
		n, readErr := audio.Read(chunk)

		// Forward whatever was read before looking at readErr: a read may
		// return data together with an error.
		if n > 0 {
			audioMsg := &voicebio.StreamingEnrollRequest{
				Request: &voicebio.StreamingEnrollRequest_Audio{
					Audio: &voicebio.Audio{Data: chunk[:n]},
				},
			}
			if sendErr := stream.Send(audioMsg); sendErr != nil {
				// As above: a failed Send needs no CloseSend.
				return sendErr
			}
		}

		if readErr == nil {
			continue
		}

		// The reader is finished (io.EOF) or failed. Either way the send
		// side is closed before reporting the outcome.
		if closeErr := stream.CloseSend(); closeErr != nil {
			return closeErr
		}

		if readErr == io.EOF {
			return nil
		}

		return readErr
	}
}

Re-enrollment

  • Voiceprints can be updated and made more robust by re-enrolling them with additional audio. To do this, provide the previous voiceprint data in the EnrollmentConfig, along with the additional audio, in a new StreamingEnroll request.
# Connect to server ...

# Load the voiceprint saved by an earlier enrollment.
with open("voiceprint.bin", "r") as saved:
    voiceprint = saved.read().strip()

# Supplying the earlier voiceprint in the config is what turns this
# request into a re-enrollment.
previous = voicebio.Voiceprint(data=voiceprint)
cfg = voicebio.EnrollmentConfig(
    model_id=modelID,
    previous_voiceprint=previous,
)

# Send audio to server ...
package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)

// main sketches re-enrollment: the voiceprint saved by an earlier enrollment
// is loaded from voiceprint.bin and placed in the enrollment config. The
// connection setup and the audio streaming are elided here.
func main() {
	// Connect to server ...

	// Load the voiceprint saved by an earlier enrollment.
	saved, err := os.ReadFile("voiceprint.bin")
	if err != nil {
		fmt.Printf("\nfailed to read voiceprint data: %v\n", err)
		os.Exit(1)
	}

	// Supplying the earlier voiceprint in the config is what turns this
	// request into a re-enrollment.
	previous := &voicebio.Voiceprint{Data: string(saved)}
	cfg := &voicebio.EnrollmentConfig{
		ModelId:            modelResp.Models[0].Id,
		PreviousVoiceprint: previous,
	}

	// Send audio to server ...
}