Streaming Verification

Describes how to stream audio to the VoiceBio server for verification against a voiceprint.

The following example shows how to stream audio using VoiceBio’s StreamingVerify request and verify whether the audio matches the provided voiceprint. The stream can come from a file on disk or directly from a microphone in real time.

Streaming from an audio file

We support several headered file formats, including WAV, MP3, and FLAC. For more details, please see the protocol buffer specification here. For best accuracy, we recommend using an uncompressed or losslessly compressed audio format such as WAV or FLAC.
The examples below use a WAV file as input. We will query the server for available models and use the first model to score and verify the given audio against the given voiceprint.

Info

Voiceprints provided in StreamingVerify requests must be generated using the same or compatible model via StreamingEnroll.

Python
Go

import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio

serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())

print("Models:")
for model in modelResp.models:
    print(model)

# Verification needs a model; stop with a clear message instead of an
# IndexError when the server reports none.
if len(modelResp.models) == 0:
    raise SystemExit("no models available on the server")

# Select a model ID from the list above. Going with the first model
# in this example.
modelID = modelResp.models[0].id

# Loading reference voiceprint. The file is read as text and surrounding
# whitespace (e.g. a trailing newline) is removed.
with open("voiceprint.bin", 'r') as f:
    voiceprint = voicebio.Voiceprint(data=f.read().strip())

# Set the verification config. We don't set the audio format and let the
# server auto-detect the format from the file header.
cfg = voicebio.VerificationConfig(
    model_id=modelID,
    voiceprint=voiceprint,
)

# Request generator for StreamingVerify: the very first message carries
# only the configuration, and every message after that carries a chunk
# of audio bytes.
def stream(cfg, audio, bufferSize=1024):
    """Yield the config request, then one audio request per chunk read.

    Chunks are read from `audio` with `audio.read(bufferSize)` until a
    read returns no data.
    """
    yield voicebio.StreamingVerifyRequest(config=cfg)

    while True:
        chunk = audio.read(bufferSize)
        if len(chunk) == 0:
            # Nothing left to read: end of the request stream.
            break
        yield voicebio.StreamingVerifyRequest(audio=voicebio.Audio(data=chunk))

# Send the config followed by the contents of the WAV file to the server.
with open("test.wav", "rb") as audio:
    resp = client.StreamingVerify(stream(cfg, audio))

# The response carries a similarity score and a flag telling whether that
# score exceeded the server-configured threshold for being a match.
score = resp.result.similarity_score
matched = resp.result.is_match
print(f"Verification Score: {score:1.3f}, Match: {matched}")

package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)

// main connects to a VoiceBio server without TLS, prints the server version
// and the available models, and then verifies the audio in test.wav against
// the voiceprint stored in voiceprint.bin using the first listed model.
func main() {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		fmt.Printf("failed to dial gRPC connection: %v\n", err)
		os.Exit(1)
	}

	// Release the connection when main returns normally. The os.Exit paths
	// below end the process without running deferred calls.
	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		fmt.Printf("failed to get server version: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("%v\n", versionResp)

	// Get the list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		fmt.Printf("failed to get model list: %v\n", err)
		os.Exit(1)
	}

	fmt.Println("Models:")
	for _, m := range modelResp.Models {
		fmt.Println(m)
	}
	fmt.Println()

	// Verification needs a model; exit with a clear message instead of
	// panicking on the index below when the server reports none.
	if len(modelResp.Models) == 0 {
		fmt.Println("no models available on the server")
		os.Exit(1)
	}

	// Reading voiceprint data.
	data, err := os.ReadFile("voiceprint.bin")
	if err != nil {
		fmt.Printf("\nfailed to read voiceprint data: %v\n", err)
		os.Exit(1)
	}

	// Selecting the first model. No audio format is set in the config.
	cfg := &voicebio.VerificationConfig{
		ModelId:    modelResp.Models[0].Id,
		Voiceprint: &voicebio.Voiceprint{Data: string(data)},
	}

	// Opening audio file.
	audio, err := os.Open("test.wav")
	if err != nil {
		fmt.Printf("failed to open audio file: %v\n", err)
		os.Exit(1)
	}

	defer audio.Close()

	// Starting verification.
	resp, err := StreamingVerify(ctx, client, cfg, audio)
	if err != nil {
		fmt.Printf("failed to run streaming verification: %v\n", err)
		os.Exit(1)
	}

	// Server returns a similarity score along with whether the score
	// exceeded the server-configured threshold for being a match.
	fmt.Printf("Verification Score: %1.3f, Match: %v\n", resp.Result.SimilarityScore, resp.Result.IsMatch)
}

// StreamingVerify wraps the streaming API for performing speaker verification
// using the given cfg.
//
// It opens a StreamingVerify stream on client, sends cfg followed by the data
// read from audio in chunks of up to 1024 bytes, and returns the response the
// server sends once the stream is closed.
//
// If reading the audio or sending it to the server fails with an error other
// than io.EOF, that error is returned immediately. An io.EOF from sending is
// not treated as a failure here; the result of CloseAndRecv is returned
// instead.
func StreamingVerify(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.VerificationConfig,
	audio io.Reader,
) (*voicebio.StreamingVerifyResponse, error) {
	const (
		streamingBufSize = 1024
	)

	// Tie the stream to a context that is cancelled when this function
	// returns, so that an early return on error does not leave the stream
	// (and its resources) alive until the caller's ctx ends.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Creating stream.
	stream, err := client.StreamingVerify(ctx)
	if err != nil {
		return nil, err
	}

	// Sending audio.
	if err := sendAudio(stream, cfg, audio, streamingBufSize); err != nil && !errors.Is(err, io.EOF) {
		// if sendAudio encountered io.EOF, it's only a
		// notification that the stream has closed.  The actual
		// status will be obtained in the CloseAndRecv call. We
		// therefore return on non-EOF errors here.
		return nil, err
	}

	// Returning result.
	return stream.CloseAndRecv()
}

// sendAudio writes cfg to stream as the first message and then forwards the
// contents of audio as a sequence of audio messages, each holding up to
// bufSize bytes.
//
// A failed Send is returned as-is without calling CloseSend. Once audio stops
// yielding data (io.EOF or any other read error), CloseSend is called; a
// CloseSend failure is returned first, otherwise the result is nil for io.EOF
// and the read error for anything else.
func sendAudio(
	stream voicebio.VoiceBioService_StreamingVerifyClient,
	cfg *voicebio.VerificationConfig,
	audio io.Reader,
	bufSize uint32,
) error {
	// The config message always goes first.
	cfgMsg := &voicebio.StreamingVerifyRequest{
		Request: &voicebio.StreamingVerifyRequest_Config{Config: cfg},
	}
	if sendErr := stream.Send(cfgMsg); sendErr != nil {
		return sendErr
	}

	// Every following message carries audio. Keep reading until the reader
	// reports an error; bytes returned together with that error are still
	// sent before the loop ends.
	chunk := make([]byte, bufSize)

	var readErr error
	for readErr == nil {
		var n int

		n, readErr = audio.Read(chunk)
		if n <= 0 {
			continue
		}

		audioMsg := &voicebio.StreamingVerifyRequest{
			Request: &voicebio.StreamingVerifyRequest_Audio{
				Audio: &voicebio.Audio{Data: chunk[:n]},
			},
		}
		if sendErr := stream.Send(audioMsg); sendErr != nil {
			return sendErr
		}
	}

	// Reading is over, whether by io.EOF or by a real error: close the
	// sending side of the stream before reporting the outcome.
	if closeErr := stream.CloseSend(); closeErr != nil {
		return closeErr
	}

	if readErr == io.EOF {
		return nil
	}

	return readErr
}

Streaming from microphone

Streaming audio from a microphone requires a reader interface that can provide audio samples recorded from the microphone; typically this requires interaction with system libraries. Another option is to use an external command line tool like sox to record and pipe audio into the client.
The examples below use the latter approach by using the rec command provided with sox to record and stream the audio.

Python
Go

#!/usr/bin/env python3

# This example assumes sox is installed on the system and is available
# in the system's PATH variable. Instead of opening a regular file from
# disk, we open a subprocess that executes sox's rec command to record
# audio from the system's default microphone.

import subprocess
import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio

serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())

print("Models:")
for model in modelResp.models:
    print(model)

# Verification needs a model; stop with a clear message instead of an
# IndexError when the server reports none.
if len(modelResp.models) == 0:
    raise SystemExit("no models available on the server")

# Select a model ID from the list above. Going with the first model
# in this example.
m = modelResp.models[0]
modelID = m.id

# Loading reference voiceprint. The file is read as text and surrounding
# whitespace (e.g. a trailing newline) is removed.
with open("voiceprint.bin", 'r') as f:
    voiceprint = voicebio.Voiceprint(data=f.read().strip())

# Setting audio format to be raw 16-bit signed little endian audio samples
# recorded at the sample rate expected by the model.
cfg = voicebio.VerificationConfig(
    model_id=modelID,
    voiceprint=voiceprint,
    audio_format=voicebio.AudioFormat(
      audio_format_raw=voicebio.AudioFormatRAW(
        encoding="AUDIO_ENCODING_SIGNED",
        bit_depth=16,
        byte_order="BYTE_ORDER_LITTLE_ENDIAN",
        sample_rate=m.attributes.sample_rate,
        channels=1,
      )
    ),
)

# Open microphone stream using sox's rec command and record
# audio using the config specified above for *10 seconds*.
maxDuration = 10
cmd = f"rec -t raw -r {m.attributes.sample_rate} -e signed -b 16 -L -c 1 - trim 0 {maxDuration}"
mic = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
audio = mic.stdout

# Sanity check: wait for the recorder to produce its first bytes. peek()
# leaves them in the pipe's buffer, so no audio is dropped before
# streaming starts. An empty result means the recorder closed its output
# without producing any audio.
try:
    head = audio.peek(1)
except Exception as err:
    mic.kill()
    raise SystemExit(f"[ERROR] failed to read audio from mic stream: {err}")

if len(head) == 0:
    mic.kill()
    raise SystemExit("[ERROR] failed to read audio from mic stream: recorder produced no audio")

print(f"\n[INFO] recording {maxDuration} seconds of audio microphone ... \n")

# Request generator for StreamingVerify: the very first message carries
# only the verification configuration, and every message after that
# carries a chunk of audio bytes.
def stream(cfg, audio, bufferSize=1024):
    """Yield the config request, then one audio request per chunk read.

    Chunks are read from `audio` with `audio.read(bufferSize)` until a
    read returns no data.
    """
    yield voicebio.StreamingVerifyRequest(config=cfg)

    while True:
        chunk = audio.read(bufferSize)
        if len(chunk) == 0:
            # Nothing left to read: end of the request stream.
            break
        yield voicebio.StreamingVerifyRequest(audio=voicebio.Audio(data=chunk))

# Streaming audio to the server. The recorder is stopped and its pipe is
# closed whether or not the request succeeds, so a failed request does not
# leave the rec process running.
try:
    resp = client.StreamingVerify(stream(cfg, audio))
finally:
    # Stop the recorder first so that any read still pending on the pipe
    # sees end-of-file, then reap the process and release the pipe.
    mic.kill()
    mic.wait()
    audio.close()

# Server returns a similarity score along with whether the score
# exceeded the server-configured threshold for being a match.
print(f"Verification Score: {resp.result.similarity_score:1.3f}, Match: {resp.result.is_match}")

package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"strings"

	"golang.org/x/sync/errgroup"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)

// main connects to a VoiceBio server without TLS, prints the server version
// and the available models, records audio from the microphone with sox's rec
// command, and verifies that audio against the voiceprint stored in
// voiceprint.bin using the first listed model.
func main() {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		fmt.Printf("failed to dial gRPC connection: %v\n", err)
		os.Exit(1)
	}

	// Release the connection when main returns normally. The os.Exit paths
	// below end the process without running deferred calls.
	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		fmt.Printf("failed to get server version: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("%v\n", versionResp)

	// Get the list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		fmt.Printf("failed to get model list: %v\n", err)
		os.Exit(1)
	}

	fmt.Println("Models:")
	for _, m := range modelResp.Models {
		fmt.Println(m)
	}
	fmt.Println()

	// Verification needs a model; exit with a clear message instead of
	// panicking on the index below when the server reports none.
	if len(modelResp.Models) == 0 {
		fmt.Println("no models available on the server")
		os.Exit(1)
	}

	// Selecting first model.
	m := modelResp.Models[0]

	// Reading voiceprint data.
	data, err := os.ReadFile("voiceprint.bin")
	if err != nil {
		fmt.Printf("\nfailed to read voiceprint data: %v\n", err)
		os.Exit(1)
	}

	// Setting audio format to be raw 16-bit signed little endian audio samples
	// recorded at the sample rate expected by the model.
	cfg := &voicebio.VerificationConfig{
		ModelId:    m.Id,
		Voiceprint: &voicebio.Voiceprint{Data: string(data)},
		AudioFormat: &voicebio.AudioFormat{AudioFormat: &voicebio.AudioFormat_AudioFormatRaw{
			AudioFormatRaw: &voicebio.AudioFormatRAW{
				Encoding:   voicebio.AudioEncoding_AUDIO_ENCODING_SIGNED,
				SampleRate: m.Attributes.SampleRate,
				BitDepth:   16,
				ByteOrder:  voicebio.ByteOrder_BYTE_ORDER_LITTLE_ENDIAN,
				Channels:   1,
			},
		},
		},
	}

	// The recorder and the streaming request run concurrently in an
	// errgroup.Group. Its context is cancelled as soon as either one
	// fails, which stops the other: the rec process is started with this
	// context, and so is the gRPC stream.
	eg, ctx := errgroup.WithContext(ctx)

	// Open microphone stream using sox's rec command and record
	// audio using the config specified above for *10 seconds*.
	maxDuration := 10
	args := fmt.Sprintf("-t raw -r %d -e signed -b 16 -L -c 1 - trim 0 %d", m.Attributes.SampleRate, maxDuration)
	cmd := exec.CommandContext(ctx, "rec", strings.Fields(args)...)
	cmd.Stderr = os.Stderr

	// The recorder writes into an in-memory pipe that the streaming
	// request reads from. cmd.Run is used with this pipe rather than with
	// cmd.StdoutPipe, because Run must not be combined with StdoutPipe.
	pr, pw := io.Pipe()
	cmd.Stdout = pw

	eg.Go(func() error {
		fmt.Printf("\n[INFO] recording %d seconds from microphone \n", maxDuration)

		err := cmd.Run()
		if err != nil {
			err = fmt.Errorf("record from microphone: %w", err)
		}

		// Closing the write side ends the audio stream for the reader:
		// with io.EOF after a clean recording, or with the recording
		// error otherwise. CloseWithError always returns nil.
		_ = pw.CloseWithError(err)

		return err
	})

	// Starting verification.
	var resp *voicebio.StreamingVerifyResponse

	eg.Go(func() error {
		// If streaming stops before the recorder is done, closing the
		// read side makes the recorder's pending writes fail instead of
		// blocking forever.
		defer pr.Close()

		r, err := StreamingVerify(ctx, client, cfg, pr)
		if err != nil {
			return err
		}

		resp = r

		return nil
	})

	// Wait for both the recorder and the streaming request to finish.
	if err := eg.Wait(); err != nil {
		fmt.Printf("failed to run streaming verification: %v\n", err)
		os.Exit(1)
	}

	// Server returns a similarity score along with whether the score
	// exceeded the server-configured threshold for being a match.
	fmt.Printf("Verification Score: %1.3f, Match: %v\n", resp.Result.SimilarityScore, resp.Result.IsMatch)
}

// StreamingVerify wraps the streaming API for performing speaker verification
// using the given cfg.
//
// It opens a StreamingVerify stream on client, sends cfg followed by the data
// read from audio in chunks of up to 1024 bytes, and returns the response the
// server sends once the stream is closed.
//
// If reading the audio or sending it to the server fails with an error other
// than io.EOF, that error is returned immediately. An io.EOF from sending is
// not treated as a failure here; the result of CloseAndRecv is returned
// instead.
func StreamingVerify(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.VerificationConfig,
	audio io.Reader,
) (*voicebio.StreamingVerifyResponse, error) {
	const (
		streamingBufSize = 1024
	)

	// Tie the stream to a context that is cancelled when this function
	// returns, so that an early return on error does not leave the stream
	// (and its resources) alive until the caller's ctx ends.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Creating stream.
	stream, err := client.StreamingVerify(ctx)
	if err != nil {
		return nil, err
	}

	// Sending audio.
	if err := sendAudio(stream, cfg, audio, streamingBufSize); err != nil && !errors.Is(err, io.EOF) {
		// if sendAudio encountered io.EOF, it's only a
		// notification that the stream has closed.  The actual
		// status will be obtained in the CloseAndRecv call. We
		// therefore return on non-EOF errors here.
		return nil, err
	}

	// Returning result.
	return stream.CloseAndRecv()
}

// sendAudio writes cfg to stream as the first message and then forwards the
// contents of audio as a sequence of audio messages, each holding up to
// bufSize bytes.
//
// A failed Send is returned as-is without calling CloseSend. Once audio stops
// yielding data (io.EOF or any other read error), CloseSend is called; a
// CloseSend failure is returned first, otherwise the result is nil for io.EOF
// and the read error for anything else.
func sendAudio(
	stream voicebio.VoiceBioService_StreamingVerifyClient,
	cfg *voicebio.VerificationConfig,
	audio io.Reader,
	bufSize uint32,
) error {
	// The config message always goes first.
	cfgMsg := &voicebio.StreamingVerifyRequest{
		Request: &voicebio.StreamingVerifyRequest_Config{Config: cfg},
	}
	if sendErr := stream.Send(cfgMsg); sendErr != nil {
		return sendErr
	}

	// Every following message carries audio. Keep reading until the reader
	// reports an error; bytes returned together with that error are still
	// sent before the loop ends.
	chunk := make([]byte, bufSize)

	var readErr error
	for readErr == nil {
		var n int

		n, readErr = audio.Read(chunk)
		if n <= 0 {
			continue
		}

		audioMsg := &voicebio.StreamingVerifyRequest{
			Request: &voicebio.StreamingVerifyRequest_Audio{
				Audio: &voicebio.Audio{Data: chunk[:n]},
			},
		}
		if sendErr := stream.Send(audioMsg); sendErr != nil {
			return sendErr
		}
	}

	// Reading is over, whether by io.EOF or by a real error: close the
	// sending side of the stream before reporting the outcome.
	if closeErr := stream.CloseSend(); closeErr != nil {
		return closeErr
	}

	if readErr == io.EOF {
		return nil
	}

	return readErr
}