Vectorizing Voiceprints

Describes how to convert voiceprints into vector representations using VoiceBio’s VectorizeVoiceprints API.

Voiceprints can also be vectorized using the VectorizeVoiceprints API, which returns a vector representation of each voiceprint. These vectors can be used for downstream tasks such as clustering, custom scoring, input to other machine learning models, or even semantic search in vector databases.
See the API reference for more details.
The following example shows how to use the VectorizeVoiceprints API to vectorize voiceprints. The voiceprints can be loaded from files on disk or obtained from previous enrollment sessions.

Info

Voiceprints provided in VectorizeVoiceprints requests must have been generated via StreamingEnroll using the same model as the request, or a compatible one.

Python
Go

import numpy as np

import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio

serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())

print("Models:")
for model in modelResp.models:
    print(model)

# Nothing can be vectorized without a model; stop with a clear message
# instead of failing with an IndexError on the lookup below.
if len(modelResp.models) == 0:
    raise SystemExit("no models available on the server")

# Select a model ID from the list above. Going with the first model
# in this example. The model ID should be the same as the one used to
# generate the voiceprints being vectorized.
modelID = modelResp.models[0].id

# Loading voiceprints.
voiceprints = []
for p in ["user1.bin", "user2.bin", "user3.bin"]:
    with open(p, 'r') as f:
        voiceprints.append(voicebio.Voiceprint(data=f.read().strip()))

# Set the vectorization config.
req = voicebio.VectorizeVoiceprintsRequest(
    model_id=modelID,
    voiceprints=voiceprints,
)

# Vectorize voiceprints.
result = client.VectorizeVoiceprints(req)

# The server returns a list of vectorized voiceprints in the same order as the input voiceprints.
#
# In most cases, the vectorized voiceprints can be compared using simple distance metrics such as
# cosine similarity or euclidean distance. This is not guaranteed, however, and depends on the model
# used to generate the voiceprints and vectorize them.

# Example using cosine similarity.
#
# Convert each vector to a numpy array and compute its norm once, rather than
# recomputing both norms for every (i, j) pair.
# NOTE(review): assumes each returned voiceprint's `data` is a sequence of numbers - confirm
# against the API reference.
vectors = [np.array(v.data, dtype=np.float64) for v in result.voiceprints]
norms = [np.linalg.norm(v) for v in vectors]

n = len(vectors)
similarity = np.zeros((n, n), dtype=np.float32)

for i in range(n):
    for j in range(n):
        similarity[i, j] = np.dot(vectors[i], vectors[j]) / (norms[i] * norms[j])

print("Cosine Similarity Matrix:")
print(similarity)

# Release the connection now that all requests are done.
channel.close()

package main

import (
	"context"
	"fmt"
	"math"
	"os"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)

// main connects to a VoiceBio server over a gRPC channel without TLS, prints
// the server version and the available models, vectorizes the voiceprints
// stored in user1.bin, user2.bin and user3.bin using the first listed model,
// and prints the pairwise cosine similarity matrix of the resulting vectors.
//
// Any failure is reported on standard output and the program exits with
// status 1.
func main() {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		fmt.Printf("failed to dial gRPC connection: %v\n", err)
		os.Exit(1)
	}
	// Close the connection when main returns normally. The os.Exit calls on
	// the error paths below skip deferred calls, but the process is
	// terminating there anyway.
	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		fmt.Printf("failed to get server version: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("%v\n", versionResp)

	// Get list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		fmt.Printf("failed to get model list: %v\n", err)
		os.Exit(1)
	}

	fmt.Println("Models:")
	for _, m := range modelResp.Models {
		fmt.Println(m)
	}
	fmt.Println()

	// Nothing can be vectorized without a model; report it instead of
	// panicking on the Models[0] lookup below.
	if len(modelResp.Models) == 0 {
		fmt.Printf("no models available on the server\n")
		os.Exit(1)
	}

	// Reading voiceprint data.
	paths := []string{"user1.bin", "user2.bin", "user3.bin"}
	voiceprints := make([]*voicebio.Voiceprint, 0, len(paths))

	for i, p := range paths {
		data, err := os.ReadFile(p)
		if err != nil {
			fmt.Printf("\nfailed to read voiceprint[%d] data: %v\n", i, err)
			os.Exit(1)
		}

		voiceprints = append(voiceprints, &voicebio.Voiceprint{Data: string(data)})
	}

	// Selecting the first model. The model ID should be the same as the one used to generate the
	// voiceprints being vectorized.
	req := &voicebio.VectorizeVoiceprintsRequest{
		ModelId:     modelResp.Models[0].Id,
		Voiceprints: voiceprints,
	}

	// Vectorize voiceprints.
	result, err := client.VectorizeVoiceprints(ctx, req)
	if err != nil {
		fmt.Printf("failed to vectorize voiceprints: %v\n", err)
		os.Exit(1)
	}

	// The server returns a list of vectorized voiceprints in the same order as the input voiceprints.
	//
	// In most cases, the vectorized voiceprints can be compared using simple distance metrics such as
	// cosine similarity or euclidean distance. This is not guaranteed, however, and depends on the model
	// used to generate the voiceprints and vectorize them.

	// Example using cosine similarity.
	n := len(result.Voiceprints)

	// Compute each vector's norm once, and verify that all vectors have the
	// same dimension so the element-wise products below cannot index out of
	// range.
	norms := make([]float64, n)
	for i, v := range result.Voiceprints {
		if want := len(result.Voiceprints[0].Data); len(v.Data) != want {
			fmt.Printf("vectorized voiceprint[%d] has %d dimensions, expected %d\n", i, len(v.Data), want)
			os.Exit(1)
		}

		var sumSquares float64
		for _, x := range v.Data {
			sumSquares += float64(x) * float64(x)
		}

		norms[i] = math.Sqrt(sumSquares)
	}

	similarity := make([][]float32, n)
	for i := range similarity {
		similarity[i] = make([]float32, n)
	}

	for i, vi := range result.Voiceprints {
		for j, vj := range result.Voiceprints {
			denom := norms[i] * norms[j]
			if denom == 0 {
				// Cosine similarity is undefined for a zero vector; leave the
				// entry at 0 rather than printing NaN.
				continue
			}

			var dotProduct float64
			for k := range vi.Data {
				dotProduct += float64(vi.Data[k]) * float64(vj.Data[k])
			}

			similarity[i][j] = float32(dotProduct / denom)
		}
	}

	fmt.Printf("Cosine Similarity Matrix:\n")
	for i := range similarity {
		for j := range similarity[i] {
			fmt.Printf("%1.3f ", similarity[i][j])
		}

		fmt.Println()
	}
}