facebookresearch / fairseq

Facebook AI Research Sequence-to-Sequence Toolkit written in Python.


wav2vec 2.0 inference pipeline

loretoparisi opened this issue · comments

🚀 Feature Request

Provide a simple inference pipeline for the wav2vec 2.0 model.

Motivation

The current inference script, examples/speech_recognition/infer.py, handles a lot of cases, which makes it extremely complex.

Pitch

A single python script that loads and runs inference with wav2vec 2.0 pre-trained model on a single wav file or on a programmatically loaded waveform signal.

Alternatives

Additional context

This kind of inference pipeline would enable individual researchers to test the model on their own audio datasets and against other models.

If anyone has succeeded in putting together a simple inference script, I would appreciate it if you could post it here.
If I succeed, I will post my code here.

I succeeded!!
I'll wrap up the code and put it up here!

I did it with fairseq version 0.9.0.
wav2vec 2.0 is not supported in fairseq-0.9.0, so I took the relevant code from the newer fairseq sources and applied it.
I hope this helps.

I will improve the code further and send a pull request.
Here is my code.

import contextlib
import itertools as it
import math
import os
import sys

import numpy as np
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F

from fairseq import checkpoint_utils, options, tasks, utils
from fairseq.data import Dictionary
from fairseq.models import BaseFairseqModel, FairseqEncoder
from fairseq.tasks.audio_pretraining import AudioPretrainingTask
from examples.wav2vec2.tasks.audio_pretraining import Wav2vec2PretrainingTask
from wav2letter.decoder import CriterionType
from wav2letter.criterion import CpuViterbiPath, get_data_ptr_as_bytes


def post_process(sentence: str, symbol: str):
    if symbol == "sentencepiece":
        sentence = sentence.replace(" ", "").replace("\u2581", " ").strip()
    elif symbol == 'wordpiece':
        sentence = sentence.replace(" ", "").replace("_", " ").strip()
    elif symbol == 'letter':
        sentence = sentence.replace(" ", "").replace("|", " ").strip()
    elif symbol == "_EOW":
        sentence = sentence.replace(" ", "").replace("_EOW", " ").strip()
    elif symbol is not None and symbol != 'none':
        sentence = (sentence + " ").replace(symbol, "").rstrip()
    return sentence
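
# Example (for the letter-level dictionary used below):
#   post_process("H E L L O | W O R L D |", "letter") -> "HELLO WORLD"
# i.e. the spaces between letters are dropped and "|" marks word boundaries.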


class Wav2VecEncoder(FairseqEncoder):
    def __init__(self, args, tgt_dict=None):
        self.apply_mask = args.apply_mask

        arg_overrides = {
            "dropout": args.dropout,
            "activation_dropout": args.activation_dropout,
            "dropout_input": args.dropout_input,
            "attention_dropout": args.attention_dropout,
            "mask_length": args.mask_length,
            "mask_prob": args.mask_prob,
            "mask_selection": args.mask_selection,
            "mask_other": args.mask_other,
            "no_mask_overlap": args.no_mask_overlap,
            "mask_channel_length": args.mask_channel_length,
            "mask_channel_prob": args.mask_channel_prob,
            "mask_channel_selection": args.mask_channel_selection,
            "mask_channel_other": args.mask_channel_other,
            "no_mask_channel_overlap": args.no_mask_channel_overlap,
            "encoder_layerdrop": args.layerdrop,
            "feature_grad_mult": args.feature_grad_mult,
        }

        if getattr(args, "w2v_args", None) is None:
            state = checkpoint_utils.load_checkpoint_to_cpu(
                args.w2v_path, arg_overrides
            )
            w2v_args = state["args"]
        else:
            state = None
            w2v_args = args.w2v_args

        assert args.normalize == w2v_args.normalize, 'Fine-tuning works best when data normalization is the same'

        w2v_args.data = args.data
        task = Wav2vec2PretrainingTask.setup_task(w2v_args)
        model = task.build_model(w2v_args)

        if state is not None and not args.no_pretrained_weights:
            model.load_state_dict(state["model"], strict=True)

        model.remove_pretraining_modules()
        super().__init__(task.source_dictionary)

        d = w2v_args.encoder_embed_dim

        self.w2v_model = model

        self.final_dropout = nn.Dropout(args.final_dropout)
        self.freeze_finetune_updates = args.freeze_finetune_updates
        self.num_updates = 0

        if tgt_dict is not None:
            self.proj = Linear(d, len(tgt_dict))
        elif getattr(args, 'decoder_embed_dim', d) != d:
            self.proj = Linear(d, args.decoder_embed_dim)
        else:
            self.proj = None

    def set_num_updates(self, num_updates):
        """Set the number of parameters updates."""
        super().set_num_updates(num_updates)
        self.num_updates = num_updates

    def forward(self, source, padding_mask, tbc=True, **kwargs):

        w2v_args = {
            "source": source,
            "padding_mask": padding_mask,
            "mask": self.apply_mask and self.training,
        }

        ft = self.freeze_finetune_updates <= self.num_updates

        with torch.no_grad() if not ft else contextlib.ExitStack():
            x, padding_mask = self.w2v_model.extract_features(**w2v_args)

            if tbc:
                # B x T x C -> T x B x C
                x = x.transpose(0, 1)

        x = self.final_dropout(x)

        if self.proj:
            x = self.proj(x)

        return {
            "encoder_out": x,  # T x B x C
            "encoder_padding_mask": padding_mask,  # B x T
            "padding_mask": padding_mask,
        }

    def reorder_encoder_out(self, encoder_out, new_order):
        if encoder_out["encoder_out"] is not None:
            encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select(
                1, new_order
            )
        if encoder_out["encoder_padding_mask"] is not None:
            encoder_out["encoder_padding_mask"] = encoder_out[
                "encoder_padding_mask"
            ].index_select(0, new_order)
        return encoder_out

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return None

    def upgrade_state_dict_named(self, state_dict, name):
        return state_dict


def Embedding(num_embeddings, embedding_dim, padding_idx):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
    nn.init.constant_(m.weight[padding_idx], 0)
    return m


def Linear(in_features, out_features, bias=True):
    m = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(m.weight)
    if bias:
        nn.init.constant_(m.bias, 0.0)
    return m


def base_architecture(args):
    args.no_pretrained_weights = getattr(args, "no_pretrained_weights", False)
    args.dropout_input = getattr(args, "dropout_input", 0)
    args.final_dropout = getattr(args, "final_dropout", 0)
    args.apply_mask = getattr(args, "apply_mask", False)
    args.dropout = getattr(args, "dropout", 0)
    args.attention_dropout = getattr(args, "attention_dropout", 0)
    args.activation_dropout = getattr(args, "activation_dropout", 0)

    args.mask_length = getattr(args, "mask_length", 10)
    args.mask_prob = getattr(args, "mask_prob", 0.5)
    args.mask_selection = getattr(args, "mask_selection", "static")
    args.mask_other = getattr(args, "mask_other", 0)
    args.no_mask_overlap = getattr(args, "no_mask_overlap", False)
    args.mask_channel_length = getattr(args, "mask_channel_length", 10)
    args.mask_channel_prob = getattr(args, "mask_channel_prob", 0.5)
    args.mask_channel_selection = getattr(args, "mask_channel_selection", "static")
    args.mask_channel_other = getattr(args, "mask_channel_other", 0)
    args.no_mask_channel_overlap = getattr(args, "no_mask_channel_overlap", False)

    args.freeze_finetune_updates = getattr(args, "freeze_finetune_updates", 0)
    args.feature_grad_mult = getattr(args, "feature_grad_mult", 0)
    args.layerdrop = getattr(args, "layerdrop", 0.0)


class W2lDecoder(object):
    def __init__(self, tgt_dict):
        self.tgt_dict = tgt_dict
        self.vocab_size = len(tgt_dict)
        self.nbest = 1

        self.criterion_type = CriterionType.CTC
        self.blank = (
            tgt_dict.index("<ctc_blank>")
            if "<ctc_blank>" in tgt_dict.indices
            else tgt_dict.bos()
        )
        self.asg_transitions = None

    def generate(self, models, sample, **unused):
        """Generate a batch of inferences."""
        # model.forward normally channels prev_output_tokens into the decoder
        # separately, but SequenceGenerator directly calls model.encoder
        encoder_input = {
            k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
        }
        emissions = self.get_emissions(models, encoder_input)
        return self.decode(emissions)

    def get_emissions(self, models, encoder_input):
        """Run encoder and normalize emissions"""
        # encoder_out = models[0].encoder(**encoder_input)
        encoder_out = models[0](**encoder_input)
        if self.criterion_type == CriterionType.CTC:
            emissions = models[0].get_normalized_probs(encoder_out, log_probs=True)

        return emissions.transpose(0, 1).float().cpu().contiguous()

    def get_tokens(self, idxs):
        """Normalize tokens by handling CTC blank, ASG replabels, etc."""
        idxs = (g[0] for g in it.groupby(idxs))
        idxs = filter(lambda x: x != self.blank, idxs)

        return torch.LongTensor(list(idxs))


class W2lViterbiDecoder(W2lDecoder):
    def __init__(self, tgt_dict):
        super().__init__(tgt_dict)

    def decode(self, emissions):
        B, T, N = emissions.size()
        hypos = list()

        if self.asg_transitions is None:
            transitions = torch.FloatTensor(N, N).zero_()
        else:
            transitions = torch.FloatTensor(self.asg_transitions).view(N, N)

        viterbi_path = torch.IntTensor(B, T)
        workspace = torch.ByteTensor(CpuViterbiPath.get_workspace_size(B, T, N))
        CpuViterbiPath.compute(
            B,
            T,
            N,
            get_data_ptr_as_bytes(emissions),
            get_data_ptr_as_bytes(transitions),
            get_data_ptr_as_bytes(viterbi_path),
            get_data_ptr_as_bytes(workspace),
        )
        return [
            [{"tokens": self.get_tokens(viterbi_path[b].tolist()), "score": 0}] for b in range(B)
        ]


class Wav2VecCtc(BaseFairseqModel):
    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # add_common_args(parser)  # defined in fairseq's wav2vec2_asr.py; not needed for this standalone inference script

    def __init__(self, w2v_encoder, args):
        super().__init__()
        self.w2v_encoder = w2v_encoder
        self.args = args

    def upgrade_state_dict_named(self, state_dict, name):
        super().upgrade_state_dict_named(state_dict, name)
        return state_dict

    @classmethod
    def build_model(cls, args, target_dict):
        """Build a new model instance."""
        base_architecture(args)
        w2v_encoder = Wav2VecEncoder(args, target_dict)
        return cls(w2v_encoder, args)

    def get_normalized_probs(self, net_output, log_probs):
        """Get normalized probabilities (or log probs) from a net's output."""

        logits = net_output["encoder_out"]
        if log_probs:
            return utils.log_softmax(logits.float(), dim=-1)
        else:
            return utils.softmax(logits.float(), dim=-1)

    def forward(self, **kwargs):
        x = self.w2v_encoder(**kwargs)
        return x


def get_feature(filepath):
    def postprocess(feats, sample_rate):
        if feats.dim == 2:
            feats = feats.mean(-1)

        assert feats.dim() == 1, feats.dim()

        with torch.no_grad():
            feats = F.layer_norm(feats, feats.shape)
        return feats

    wav, sample_rate = sf.read(filepath)
    feats = torch.from_numpy(wav).float()
    feats = postprocess(feats, sample_rate)
    return feats


def load_target_dict(manifest_path='./manifest'):
    dict_path = os.path.join(manifest_path, "dict.ltr.txt")
    target_dict = Dictionary.load(dict_path)
    return target_dict


def load_model(model_path, target_dict):
    # state = checkpoint_utils.load_checkpoint_to_cpu(model_path)
    # args = state["args"]
    w2v = torch.load(model_path)

    # from examples.wav2vec2.models.wav2vec2_asr import Wav2Vec2Model
    model = Wav2VecCtc.build_model(w2v["args"], target_dict)
    model.load_state_dict(w2v["model"], strict=True)

    return [model]


def main():
    sample, input = dict(), dict()
    WAV_PATH = 'xxx.wav'
    W2V_PATH = 'wav2vec2_vox_960h.pt'

    manifest_path = "MANIFEST_PATH"
    feature = get_feature(WAV_PATH)

    use_cuda = torch.cuda.is_available()

    target_dict = load_target_dict(manifest_path)
    model = load_model(W2V_PATH, target_dict)
    model[0].eval()

    generator = W2lViterbiDecoder(target_dict)
    input["source"] = feature.unsqueeze(0)

    padding_mask = torch.BoolTensor(input["source"].size(1)).fill_(False).unsqueeze(0)

    input["padding_mask"] = padding_mask
    sample["net_input"] = input

    with torch.no_grad():
        hypo = generator.generate(model, sample, prefix_tokens=None)

    hyp_pieces = target_dict.string(hypo[0][0]["tokens"].int().cpu())
    print(post_process(hyp_pieces, 'letter'))


if __name__ == '__main__':
    main()
  • Output
I CAME TO THE CONCLUSION THAT WHAT WE NEED IN EDUCATION IS MUCH BETTER UNDERSTANDING EXCLUSIVE AND LEARNING FROM A MOTIVATION OF PERSPECTIVE FROM A PSYCHOLOGICAL REPROSPECTIVE
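
For reference, the same pipeline also works on a programmatically loaded waveform (as asked for in the pitch). A minimal sketch, assuming a mono 16 kHz float numpy array named wav; only get_feature changes, the rest of main() stays the same:

import numpy as np
import torch
import torch.nn.functional as F

def get_feature_from_array(wav: np.ndarray) -> torch.Tensor:
    """Build the model input from an in-memory waveform instead of a wav file."""
    feats = torch.from_numpy(wav).float()
    if feats.dim() == 2:  # stereo -> mono
        feats = feats.mean(-1)
    with torch.no_grad():
        feats = F.layer_norm(feats, feats.shape)
    return feats

# usage inside main():  feature = get_feature_from_array(wav)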

@sooftware amazing!!! Did you use the latest version of wav2letter?

I'm not sure, but here are the commands I used.

# Install python libraries
pip install soundfile
pip install torchaudio
pip install sentencepiece

# Update apt-get & Install soundfile
apt-get update \
&& apt-get upgrade -y \
&& apt-get install -y \
&& apt-get -y install apt-utils gcc libpq-dev libsndfile-dev

# Install kenlm
mkdir external_lib
cd external_lib

sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev
git clone https://github.com/kpu/kenlm.git
cd kenlm
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DKENLM_MAX_ORDER=20 -DCMAKE_POSITION_INDEPENDENT_CODE=ON
make -j 16
export KENLM_ROOT_DIR=$ABSOLUTE_PATH'/external_lib/kenlm/'
cd ../..

# Install Additional Dependencies (ATLAS, OpenBLAS, Accelerate, Intel MKL)
apt-get install libsndfile1-dev libopenblas-dev libfftw3-dev libgflags-dev libgoogle-glog-dev

# Install wav2letter
git clone -b v0.2 https://github.com/facebookresearch/wav2letter.git
cd wav2letter/bindings/python
pip install -e .
cd ../../..

I installed wav2letter a few days ago.

@sooftware Thanks! I'm getting an import error for ModuleNotFoundError: No module named 'examples.wav2vec2'.
This module doesn't exist in fairseq though. Did you add it from somewhere else?

@sooftware Could you please specify what you have inside the file at manifest_path = "MANIFEST_PATH"?

What should this path point to?

@mironnn The manifest path only contains the dictionary from what I can tell. Look at the load_target_dict function

def load_target_dict(manifest_path='./manifest'):
    dict_path = os.path.join(manifest_path, "dict.ltr.txt")
    target_dict = Dictionary.load(dict_path)
    return target_dict
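
The dict.ltr.txt referenced here is just fairseq's letter-level dictionary file, one "TOKEN count" pair per line, downloadable alongside the fine-tuned checkpoints. A quick sanity check, assuming the file sits in the current directory:

from fairseq.data import Dictionary

target_dict = Dictionary.load("dict.ltr.txt")
print(len(target_dict))          # vocabulary size, including the special symbols
print(target_dict.symbols[:10])  # special symbols first, then '|', 'E', 'T', ... by frequency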

@sooftware Thanks! I'm getting an import error for ModuleNotFoundError: No module named 'examples.wav2vec2'.
This module doesn't exist in fairseq though. Did you add it from somewhere else?

Have the same issue =(

@kpister I created a wav2vec2 package in the examples folder myself, because I was using fairseq-0.9.0.
I'll write code for the latest fairseq! Please wait a little.

@mironnn

I created a pull request (#2668).
I added recognize.py to the examples/wav2vec/ directory.
Usage is simple.

  • Command
$ python3 examples/wav2vec/recognize.py --wav_path $WAV_PATH --w2v_path $W2V_PATH --target_dict_path $TARGET_DICT_PATH
  • Output
I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I LOVE THEE PURELY AS THEY TURN FROM PRAISE

Here is the code recognize.py

import torch
import argparse
import soundfile as sf
import torch.nn.functional as F
import itertools as it
from fairseq import utils
from fairseq.models import BaseFairseqModel
from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder
from fairseq.data import Dictionary
from fairseq.models.wav2vec.wav2vec2_asr import base_architecture, Wav2VecEncoder
from wav2letter.decoder import CriterionType
from wav2letter.criterion import CpuViterbiPath, get_data_ptr_as_bytes

parser = argparse.ArgumentParser(description='Wav2vec-2.0 Recognize')
parser.add_argument('--wav_path', type=str,
                    default='~/xxx.wav',
                    help='path of wave file')
parser.add_argument('--w2v_path', type=str,
                    default='~/wav2vec2_vox_960h.pt',
                    help='path of pre-trained wav2vec-2.0 model')
parser.add_argument('--target_dict_path', type=str,
                    default='dict.ltr.txt',
                    help='path of target dict (dict.ltr.txt)')


class Wav2VecCtc(BaseFairseqModel):
    def __init__(self, w2v_encoder, args):
        super().__init__()
        self.w2v_encoder = w2v_encoder
        self.args = args

    def upgrade_state_dict_named(self, state_dict, name):
        super().upgrade_state_dict_named(state_dict, name)
        return state_dict

    @classmethod
    def build_model(cls, args, target_dict):
        """Build a new model instance."""
        base_architecture(args)
        w2v_encoder = Wav2VecEncoder(args, target_dict)
        return cls(w2v_encoder, args)

    def get_normalized_probs(self, net_output, log_probs):
        """Get normalized probabilities (or log probs) from a net's output."""
        logits = net_output["encoder_out"]
        if log_probs:
            return utils.log_softmax(logits.float(), dim=-1)
        else:
            return utils.softmax(logits.float(), dim=-1)

    def forward(self, **kwargs):
        x = self.w2v_encoder(**kwargs)
        return x


class W2lDecoder(object):
    def __init__(self, tgt_dict):
        self.tgt_dict = tgt_dict
        self.vocab_size = len(tgt_dict)
        self.nbest = 1

        self.criterion_type = CriterionType.CTC
        self.blank = (
            tgt_dict.index("<ctc_blank>")
            if "<ctc_blank>" in tgt_dict.indices
            else tgt_dict.bos()
        )
        self.asg_transitions = None

    def generate(self, models, sample, **unused):
        """Generate a batch of inferences."""
        # model.forward normally channels prev_output_tokens into the decoder
        # separately, but SequenceGenerator directly calls model.encoder
        encoder_input = {
            k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
        }
        emissions = self.get_emissions(models, encoder_input)
        return self.decode(emissions)

    def get_emissions(self, models, encoder_input):
        """Run encoder and normalize emissions"""
        # encoder_out = models[0].encoder(**encoder_input)
        encoder_out = models[0](**encoder_input)
        if self.criterion_type == CriterionType.CTC:
            emissions = models[0].get_normalized_probs(encoder_out, log_probs=True)

        return emissions.transpose(0, 1).float().cpu().contiguous()

    def get_tokens(self, idxs):
        """Normalize tokens by handling CTC blank, ASG replabels, etc."""
        idxs = (g[0] for g in it.groupby(idxs))
        idxs = filter(lambda x: x != self.blank, idxs)

        return torch.LongTensor(list(idxs))


class W2lViterbiDecoder(W2lDecoder):
    def __init__(self, tgt_dict):
        super().__init__(tgt_dict)

    def decode(self, emissions):
        B, T, N = emissions.size()
        hypos = list()

        if self.asg_transitions is None:
            transitions = torch.FloatTensor(N, N).zero_()
        else:
            transitions = torch.FloatTensor(self.asg_transitions).view(N, N)

        viterbi_path = torch.IntTensor(B, T)
        workspace = torch.ByteTensor(CpuViterbiPath.get_workspace_size(B, T, N))
        CpuViterbiPath.compute(
            B,
            T,
            N,
            get_data_ptr_as_bytes(emissions),
            get_data_ptr_as_bytes(transitions),
            get_data_ptr_as_bytes(viterbi_path),
            get_data_ptr_as_bytes(workspace),
        )
        return [
            [{"tokens": self.get_tokens(viterbi_path[b].tolist()), "score": 0}] for b in range(B)
        ]


def post_process(sentence: str, symbol: str):
    if symbol == "sentencepiece":
        sentence = sentence.replace(" ", "").replace("\u2581", " ").strip()
    elif symbol == 'wordpiece':
        sentence = sentence.replace(" ", "").replace("_", " ").strip()
    elif symbol == 'letter':
        sentence = sentence.replace(" ", "").replace("|", " ").strip()
    elif symbol == "_EOW":
        sentence = sentence.replace(" ", "").replace("_EOW", " ").strip()
    elif symbol is not None and symbol != 'none':
        sentence = (sentence + " ").replace(symbol, "").rstrip()
    return sentence


def get_feature(filepath):
    def postprocess(feats, sample_rate):
        if feats.dim == 2:
            feats = feats.mean(-1)

        assert feats.dim() == 1, feats.dim()

        with torch.no_grad():
            feats = F.layer_norm(feats, feats.shape)
        return feats

    wav, sample_rate = sf.read(filepath)
    feats = torch.from_numpy(wav).float()
    feats = postprocess(feats, sample_rate)
    return feats


def load_model(model_path, target_dict):
    w2v = torch.load(model_path)
    model = Wav2VecCtc.build_model(w2v["args"], target_dict)
    model.load_state_dict(w2v["model"], strict=True)

    return [model]


def main():
    args = parser.parse_args()
    sample = dict()
    net_input = dict()

    feature = get_feature(args.wav_path)
    target_dict = Dictionary.load(args.target_dict_path)

    model = load_model(args.w2v_path, target_dict)
    model[0].eval()

    generator = W2lViterbiDecoder(target_dict)
    net_input["source"] = feature.unsqueeze(0)

    padding_mask = torch.BoolTensor(net_input["source"].size(1)).fill_(False).unsqueeze(0)

    net_input["padding_mask"] = padding_mask
    sample["net_input"] = net_input

    with torch.no_grad():
        hypo = generator.generate(model, sample, prefix_tokens=None)

    hyp_pieces = target_dict.string(hypo[0][0]["tokens"].int().cpu())
    print(post_process(hyp_pieces, 'letter'))


if __name__ == '__main__':
    main()
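
As posted, recognize.py runs everything on the CPU (the Viterbi decoder is CPU-only anyway). A sketch of the changes one could make inside main() to run the encoder on a GPU; this is not part of the submitted script:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model[0].to(device)

net_input["source"] = feature.unsqueeze(0).to(device)
net_input["padding_mask"] = padding_mask.to(device)
sample["net_input"] = net_input

with torch.no_grad():
    # emissions are moved back to the CPU inside get_emissions before Viterbi decoding
    hypo = generator.generate(model, sample, prefix_tokens=None)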


@sooftware thanks, I'm trying a CPU build, and in this case I get:

CMake Error at cmake/CUDAUtils.cmake:12 (message):
      CUDA required to build CUDA criterion backend
    Call Stack (most recent call first):
      src/libraries/criterion/CMakeLists.txt:28 (include)

I can see from your script that you build the python bindings, but how do I pass -DCRITERION_BACKEND=CPU to disable CUDA?

Oh, I'm sorry. I don't know about that issue. T.T

@loretoparisi
I tested CPU case in docker env, and the recognize.py did work.

Here are my processes below:

  1. prepare the wav2vec2 required data (model, dict, wav files) at fairseq/data:
# For example
fairseq/data/wav2vec_small_960h.pt  # model
fairseq/data/dict.ltr.txt  # dict file
fairseq/data/temp.wav  # the wav you want to test; don't forget to resample it to 16 kHz
  2. prepare the recognize.py mentioned above; I put it at fairseq/examples/wav2vec/recognize.py
  3. prepare a Dockerfile at fairseq/wav2vec2.CPU.Dockerfile; the build script is:
FROM wav2letter/wav2letter:cpu-latest

ENV USE_CUDA=0
ENV KENLM_ROOT_DIR=/root/kenlm

# will use Intel MKL for featurization but this may cause dynamic loading conflicts.
# ENV USE_MKL=1

ENV LD_LIBRARY_PATH=/opt/intel/compilers_and_libraries_2018.5.274/linux/mkl/lib/intel64:$LD_LIBRARY_PATH
WORKDIR /root/wav2letter/bindings/python

RUN pip install --upgrade pip && pip install soundfile packaging && pip install -e .

WORKDIR /root
RUN git clone https://github.com/pytorch/fairseq.git
RUN mkdir data
COPY examples/wav2vec/recognize.py /root/fairseq/examples/wav2vec/recognize.py

WORKDIR /root/fairseq
RUN pip install --editable ./ && python examples/speech_recognition/infer.py --help && python examples/wav2vec/recognize.py --help
  4. go to the fairseq/ dir, then build the docker image:
# build
docker build -t wav2vec2 -f wav2vec2.CPU.Dockerfile .

# run docker
docker run --rm -itd --ipc=host -v $PWD/data:/root/data --name w2v wav2vec2

# go into container
docker exec -it w2v bash

# run recognize
python examples/wav2vec/recognize.py --wav_path ~/data/temp.wav --w2v_path ~/data/wav2vec_small_960h.pt --target_dict_path ~/data/dict.ltr.txt

@mychiux413 thank you so much. I'm getting this UserWarning

/root/fairseq/examples/speech_recognition/w2l_decoder.py:39: UserWarning: wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings
  "wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings"
usage: recognize.py [-h] [--wav_path WAV_PATH] [--w2v_path W2V_PATH]
                    [--target_dict_path TARGET_DICT_PATH]
recognize.py: error: unrecognized arguments: --wv2_path /app/data/wav2vec_small_10m.pt

Within the container the command used was

python examples/wav2vec/recognize.py --wav_path /root/data/temp.wav --wv2_path /root/data/wav2vec_small_10m.pt --target_dict_path /root/data/dict.ltr.txt

That warning should not be there, so I have opened an issue.

@loretoparisi there is a typo: not --wv2_path, but --w2v_path. :)

@sooftware gosh!!! I've checked it ten times!

LoL!! I'm glad I found it now!
@loretoparisi Have you tried evaluating the wav2vec 2.0 model with KenLM or a Transformer LM?

@sooftware not yet, but this is definitely something I'm going to do!

Let me know if you succeed! I have an open issue (#2654) with KenLM.
If I succeed, I'll write on that issue.

@sooftware definitely I will. In the meantime I have pushed everything here with Docker. I made two Dockerfiles: the one suggested by @mychiux413 (👍 thanks) and one edited by me with your commands (👍 thank you too), slightly adapted to start from a stripped-down python:3.7.4-slim-buster. They both work, but the docker images have very different sizes:

wav2vec-python3                           latest              cfdcb450b427        51 minutes ago      9.97GB
wav2vec-wav2letter                            latest              e028493c66b0        2 hours ago         3.37GB

Thank you guys for your help and collaboration! I will keep you posted.

Grrrrrrreat !!!
I am studying wav2vec with great interest. It would be nice if we could help each other. :)

        if feats.dim == 2:

@sooftware I guess this is a typo; it worked for me when I changed if feats.dim == 2: to if feats.dim() == 2:
I have observed this in @loretoparisi's repo as well.
Anyways, thanks a ton to both of you for your awesome work!! 👍


Or you just need to convert your audio from stereo to mono; then it would be feats.dim() == 1

btw, this should actually be fixed, so I will change get_feature in my repo to:

def get_feature(filepath):
    def postprocess(feats, sample_rate):
        if feats.dim() == 2:
            feats = feats.mean(-1)

        assert feats.dim() == 1, feats.dim()

        with torch.no_grad():
            feats = F.layer_norm(feats, feats.shape)
        return feats

Yes I confirm it's a mono/stereo issue. I had an mp3 and tried converting it as:
ffmpeg -i input.mp3 -acodec pcm_s16le -ac 1 -ar 16000 output.wav
and it worked
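
The same conversion can be done in Python before calling get_feature, which is handy when the input comes from an arbitrary source. A sketch, assuming torchaudio (from the pip commands above) is installed:

import soundfile as sf
import torch
import torchaudio

def load_mono_16k(path, target_sr=16000):
    wav, sr = sf.read(path)              # shape (frames,) or (frames, channels)
    wav = torch.from_numpy(wav).float()
    if wav.dim() == 2:                   # downmix stereo to mono
        wav = wav.mean(-1)
    if sr != target_sr:                  # wav2vec 2.0 checkpoints expect 16 kHz audio
        wav = torchaudio.transforms.Resample(sr, target_sr)(wav)
    return wav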

I succeeded in installing wav2letter and fairseq, but when running recognize.py:

python examples/wav2vec/recognize.py --wav_path /root/data/audio1.wav --w2v_path /root/data/wav2vec_small_10m.pt --target_dict_path /root/data/dict.ltr.txt

I get the following UserWarning:

/root/fairseq/examples/speech_recognition/w2l_decoder.py:39: UserWarning: wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings

I used wav2vec.Dockerfile from @loretoparisi's repo for the installation.
The repo structure is:

root/
|──data/
|──fairseq/
|──wav2letter/
|──flashlight/
|──kenlm/

Did you encounter the same warning message?

I encountered the same warning message too, but it works fine. Don't worry.

@LorenzoGalizia yes, I can confirm this UserWarning. I have asked about it in a separate issue here, but it's not exactly clear what the underlying cause is.

I am getting the following error on running the python script recognize.py

Traceback (most recent call last):
  File "/home/ubuntu/fairseq/examples/wav2vec/recognize.py", line 10, in <module>
    from wav2letter.decoder import CriterionType
  File "/home/ubuntu/wav2letter/bindings/python/wav2letter/decoder.py", line 3, in <module>
    from wav2letter._decoder import *
ImportError: /home/ubuntu/wav2letter/bindings/python/wav2letter/_decoder.cpython-36m-x86_64-linux-gnu.so: undefined symbol: _ZN2lm5ngram11LoadVirtualEPKcRKNS0_6ConfigENS0_9ModelTypeE

Can someone help?

@RMisha101 If you are using the Docker repository this should not happen.

I am not using the Docker repository. Can you tell me what I am doing wrong?

@RMisha101 as far as I can see there is a problem with the Cython build of the python bindings; have a look at flashlight/wav2letter#486.
Btw, I strongly suggest using the Dockerfile we have provided to avoid these issues.

Hi all! I've just opened an issue regarding the output of the inference done with this script.
Would it be possible to get some sort of character time information? You can find the discussion here.

Hi @loretoparisi @sooftware
Can you suggest a way to add a custom vocabulary to a pre-trained wav2vec 2.0 ASR model?

Thanks in advance!

@bharat-patidar What do you mean by custom vocabulary?
The pre-trained wav2vec 2.0 model's vocab dictionary is fixed.
If you want to use a new vocabulary, you have to fine-tune from the pre-trained wav2vec 2.0 model.

In that case, you just write the vocab in this format:

vocab1 frequency1
vocab2 frequency2
vocab3 frequency3
...
...
...
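
That format is exactly what fairseq's Dictionary reads: one token and its count per line. A small sketch that builds such a letter-level vocab file from a list of transcripts (the transcripts and the output filename here are illustrative):

from collections import Counter

transcripts = ["HELLO WORLD", "BHARAT USES FAIRSEQ"]  # your fine-tuning text, uppercased
counts = Counter()
for line in transcripts:
    counts.update(line.strip().replace(" ", "|"))  # '|' marks word boundaries, as in dict.ltr.txt

with open("dict.ltr.txt", "w") as f:
    for token, freq in counts.most_common():
        f.write(f"{token} {freq}\n")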


Hi @sooftware ,

Thanks for the response.
By custom vocabulary, I mean I want to recognize a few custom words, say my name, "Bharat", or a word like "fairseq", which are not English dictionary words. What changes do we have to make for this requirement?

@bharat-patidar You don't need to change anything for that. The wav2vec model does inference at the character level,
so it can already transcribe words like "Bharat" or "fairseq".

@sooftware @loretoparisi
Can I use my own trained checkpoint file to transcribe audio files using recognize.py, or does this only work with the pre-trained models? If not, how can I modify the script to use my own model?
Thanks!

@Romulan12 You can infer with your trained checkpoint.
I succeeded in running recognition with my own model.

Hi, I am having issues with the following import:
from examples.speech_recognition import W2lViterbiDecoder
I receive the following error:
No module named 'examples.speech_recognition.utils'
However, it does recognize examples and the submodule noisychannel, but not speech_recognition.
Does anyone have the same problem and/or know the solution?

(Also, not sure if this is relevant, but in the __init__.py file there is only import examples.noisychannel and not import examples.speech_recognition)

@Romulan12 You can infer with your trained checkpoint.
I succeeded in running recognition with my own model.

How were you able to do it? I am unable to. What path do you set for the checkpoint? Can you give me the command for training? Please also give the command for inference using my own checkpoint_best.pt.
@loretoparisi @sooftware

Hi all,
In the Docker environment, the warning message can be ignored if we are NOT using a fairseq language model.
The issue was here: wav2letter#775

and the file examples/speech_recognition/w2l_decoder.py imports the module LexiconFreeDecoder as below (w2l_decoder.py#L35):

try:
    from wav2letter.common import create_word_dict, load_words
    from wav2letter.criterion import CpuViterbiPath, get_data_ptr_as_bytes
    from wav2letter.decoder import (
        CriterionType,
        DecoderOptions,
        KenLM,
        LM,
        LMState,
        SmearingMode,
        Trie,
        LexiconDecoder,
        LexiconFreeDecoder,  # ---> wav2letter doesn't support LexiconFreeDecoder in its python bindings right now.
    )
except:
    warnings.warn(
        "wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings"
    )
    LM = object
    LMState = object

So, with the default wav2letter module, the exception is always triggered when importing LexiconFreeDecoder, unless we customize the .cpp file.
If we comment out that line (w2l_decoder.py#L35), the warning message will disappear; but then, if we use fairseqlm as the decoder, we must specify the lexicon, or the program will try to call LexiconFreeDecoder() as the decoder (w2l_decoder.py#L405).
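
In short, the warning only tells you that the try/except above fell through. A quick way to check that the pieces needed for Viterbi decoding are actually importable (assuming the wav2letter v0.2 bindings from the Dockerfile above):

try:
    from wav2letter.criterion import CpuViterbiPath, get_data_ptr_as_bytes  # noqa: F401
    from wav2letter.decoder import CriterionType  # noqa: F401
    print("wav2letter bindings found: Viterbi decoding will work")
except ImportError as err:
    print("wav2letter bindings missing:", err)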


Hi everyone,
When I try to build the docker file, I get an error: ERROR: File "setup.py" not found. Directory cannot be installed in editable mode: /datadrive/conda-envs/stt/lib/python3.8/site-packages/wav2letter/bindings/python. The file is definitely there; has anyone else had this issue? Thanks in advance.

@RMisha101 Can you be more specific?
I trained the model with the command from the README.md.
After training, I ran recognition with the code above.

@kjellvb
There should be no conda in wav2letter/wav2letter:cpu-latest, and the Python version there is 3.6,
so I don't know why you got such an error message when building the docker file.

I am getting the following error on running the python script recognize.py

Traceback (most recent call last):
  File "/home/ubuntu/fairseq/examples/wav2vec/recognize.py", line 10, in <module>
    from wav2letter.decoder import CriterionType
  File "/home/ubuntu/wav2letter/bindings/python/wav2letter/decoder.py", line 3, in <module>
    from wav2letter._decoder import *
ImportError: /home/ubuntu/wav2letter/bindings/python/wav2letter/_decoder.cpython-36m-x86_64-linux-gnu.so: undefined symbol: _ZN2lm5ngram11LoadVirtualEPKcRKNS0_6ConfigENS0_9ModelTypeE

Can someone help?

I had the same error, and after studying it, the answer is that we should import it in a different way.
So you should do this:

from wav2letter import decoder
from wav2letter import criterion
from wav2letter import common

CriterionType = decoder.CriterionType
DecoderOptions = decoder.DecoderOptions
KenLM = decoder.KenLM
LexiconDecoder = decoder.LexiconDecoder
SmearingMode = decoder.SmearingMode
Trie = decoder.Trie


CpuViterbiPath = criterion.CpuViterbiPath
get_data_ptr_as_bytes = criterion.get_data_ptr_as_bytes

create_word_dict = common.create_word_dict
load_words = common.load_words
tkn_to_idx = common.tkn_to_idx
from examples.wav2vec2.tasks.audio_pretraining import Wav2vec2PretrainingTask

I believe the examples module comes from the fairseq folder, but I can't find any clue that importing the examples folder as a module would give us a wav2vec2 submodule, because the examples folder in fairseq doesn't contain a wav2vec2 package.

Can you share the file, or the script inside it?

EDIT:
sorry, I understand now: you use the wav2vec v1 approach to run prediction in the wav2vec v2 form...

Well, the way I think about it, wav2vec v2 looks like it should do a double process: first use the v1-style features to detect and locate speech, then treat them as NLP-style vectors to calculate the probability of similar patterns in other speech, which is what lets v2 be self-training in conditions without labels.

So if you want to make something similar, you should understand how fastText works and how v1 works. Then the most important part is which speech features Facebook uses to guarantee that those features exist in other speech, even under different conditions like the presence of noise, so they can still detect them and conclude that the utterances are similar.

In my opinion, all current ASR methods focus on one representation of speech. What Facebook does in v2 is calculate the context of a spoken word; roughly speaking, they analyze it the way they do in fastText. So if the speech does not show a feature clearly, for example in "mommy go to the market" the word "go" is not clear, the NLP-style method narrows the prediction, which raises the probability of guessing correctly that the missing data is "go", based on the features recognized in the words "mommy", "to", "the", and "market".

So that is how I would make a pipeline with an idea like that... hope this helps... cheers

Has anyone got the exact WER on benchmark data like LibriSpeech?

@SongGeunil1
I used wav2vec_small_960h.pt + a KenLM 4-gram .bin (beam size = 1024, lm_weight = 2.0) to infer LibriSpeech test-clean; the WER is 2.5849. I also changed the beam size to 5, and the WER increased to 2.9118.

I also tried transformer_lm.wmt19.en, which is not trained on the LibriSpeech corpus; however, the WER is 112.49!!! I don't know how to configure it.

BTW, the Viterbi WER is 3.4

@mychiux413 Hi mychiux413!
Can you help me??

I'm having difficulty decoding with KenLM (issue #2734).
Can you give me the files (kenlm, lexicon) and command line??

If you help me, it will help me a lot.
My mail address is sh951011@gmail.com

Thank You!

@sooftware
Hi,
The Docker script is: wav2letter.Dockerfile
The wav2vec model is: Wav2Vec 2.0 Base
The kenlm model is: 4-gram ARPA -> to speed things up, you should build the .bin yourself
The letter dict is: dict.ltr.txt
The lexicon is: librispeech lexicon

My command is the same as yours, and I even tried your models lm_librispeech_kenlm_word_4g_200kvocab.bin and wav2vec2_vox_960h.pt; it still worked. The only difference is that I used --cpu to infer the dataset.

My command is as below(I mounted all the models under /root/data):

# In Docker environment...
cd ~/fairseq
python examples/speech_recognition/infer.py ~/data/libri --task audio_pretraining \
--nbest 1 --path ~/data/wav2vec2_vox_960h.pt --gen-subset test --results-path ~/data/result-kenlm --w2l-decoder kenlm \
--lm-model ~/data/4-gram.bin --lm-weight 2 --word-score -1 --sil-weight 0 --criterion ctc --labels ltr --max-tokens 4000000 \
--post-process letter --cpu --num-workers 1 --batch-size 8 --lexicon ~/data/librispeech_lexicon.lst --beam 1024

In addition, here are 3 example lines from each of my test dataset files:

~/data/libri/test.ltr

A S | H E | F L E W | H I S | D O W N | R E A C H I N G | C L U T C H I N G | T A L O N S | W E R E | N O T | H A L F | A | Y A R D | A B O V E | T H E | F U G I T I V E ' S | H E A D |
B U T | H E R E | H E | W A S | A T | A | T E R R I B L E | D I S A D V A N T A G E | A S | C O M P A R E D | W I T H | T H E | O W L S | H A W K S | A N D | E A G L E S | H E | H A D | N O | R E N D I N G | C L A W S |
W H E R E | T H E | W A V E S | F O R | A N | I N S T A N T | S A N K | T H E Y | C A M E | C L O S E R | B U T | N O T | Q U I T E | W I T H I N | G R A S P I N G | R E A C H |

~/data/libri/test.tsv

/root/data/LibriSpeech/
test-clean/7176/88083/7176-88083-0019.flac	92960
test-clean/7176/88083/7176-88083-0003.flac	121600
test-clean/7176/88083/7176-88083-0020.flac	86640

~/data/libri/test.wrd

AS HE FLEW HIS DOWN REACHING CLUTCHING TALONS WERE NOT HALF A YARD ABOVE THE FUGITIVE'S HEAD
BUT HERE HE WAS AT A TERRIBLE DISADVANTAGE AS COMPARED WITH THE OWLS HAWKS AND EAGLES HE HAD NO RENDING CLAWS
WHERE THE WAVES FOR AN INSTANT SANK THEY CAME CLOSER BUT NOT QUITE WITHIN GRASPING REACH
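
For anyone building these files for their own audio, the .ltr lines are just the .wrd lines spelled out letter by letter, with '|' in place of spaces. A sketch of the conversion (this mirrors what fairseq's wav2vec label-preparation script produces; the file names are illustrative):

def wrd_to_ltr(line: str) -> str:
    # "AS HE FLEW" -> "A S | H E | F L E W |"
    return " ".join(list(line.strip().replace(" ", "|"))) + " |"

with open("test.wrd") as wrd, open("test.ltr", "w") as ltr:
    for line in wrd:
        ltr.write(wrd_to_ltr(line) + "\n")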

Has anyone got the exact WER on benchmark data like LibriSpeech?

I am getting a very high WER, in the range of 80%, when using KenLM on a subset of dev-other. This is very weird, since I get 12% on the exact same subset when using Viterbi.

@kjellvb
There should be no conda in wav2letter/wav2letter:cpu-latest, and the Python version there is 3.6,
so I don't know why you got such an error message when building the docker file.

Hi, thanks for your answer. Could this error be caused by the fact that I am not working as root? I am running it as a sudo user on a server. I tried to solve it by adding sudo commands in the docker file, but I still run into the same error. This is the Dockerfile I used:

FROM wav2letter/wav2letter:cpu-latest

ENV USE_CUDA=0
ENV KENLM_ROOT_DIR=/external_lib/kenlm

# will use Intel MKL for featurization but this may cause dynamic loading conflicts.

# ENV USE_MKL=1

ENV LD_LIBRARY_PATH=/opt/intel/compilers_and_libraries_2018.5.274/linux/mkl/lib/intel64:$LD_LIBRARY_PATH
WORKDIR /datadrive/conda-envs/stt/lib/python3.8/site-packages/wav2letter/bindings/python

RUN apt-get update && \
    apt-get -y install sudo

RUN sudo pip install --upgrade pip && sudo pip install soundfile packaging && sudo pip install -e .

WORKDIR /datadrive/conda-envs/stt/lib/python3.8/site-packages
RUN git clone https://github.com/pytorch/fairseq.git
RUN mkdir data
COPY examples/wav2vec/recognize.py /datadrive/conda-envs/stt/lib/python3.8/site-packages/examples/wav2vec/recognize.py

WORKDIR /datadrive/conda-envs/stt/lib/python3.8/site-packages/fairseq
RUN pip install --editable ./ && python examples/speech_recognition/infer.py --help && python examples/wav2vec/recognize.py --help

Thanks in advance :-)

Hey guys I think it would be cool if we set up a Colab or Kaggle notebook that everyone can just run online. In trying to run the docker image you graciously provided here: https://github.com/loretoparisi/wave2vec-recognize-docker (Thank you Loreto and everyone here! An amazing step forward), I'm running into errors on my system as you can see here: loretoparisi/wave2vec-recognize-docker#2

I think a Colab notebook would avoid all of these issues since we wouldn't need to worry about this working on different computers/OSes/environments. (Although I think both running it on a local machine and running it on Colab should be pursued.)

Googling how to install docker images on Colab brings up mixed results, but I'll try to give it a shot and if I'm able to make it work, I'll update you. Any help whatsoever would be much appreciated though!

Okay guys, here is a non-working Colab: https://colab.research.google.com/drive/11HfR3coLPU92eiUC7vrQAuQBXVR8PMQb?usp=sharing

What I'm doing for now is taking the main Dockerfile instructions provided by @loretoparisi , and then trying to run those inside of Colab

When running this command:

!cd wav2letter/bindings/python && \
    pip3 install -e .

I get this error unfortunately:

Obtaining file:///content/wav2letter/bindings/python
Installing collected packages: wav2letter
  Running setup.py develop for wav2letter
ERROR: Command errored out with exit status 1: /usr/bin/python3 -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/content/wav2letter/bindings/python/setup.py'"'"'; __file__='"'"'/content/wav2letter/bindings/python/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' develop --no-deps Check the logs for full command output.

I'll try to search for why but any help would again be much appreciated.

@youssefavx let me try to figure out your issue, because I have tested both of these Dockerfiles and they work. My docker host is macOS. To be sure I do not have any cache, I will clean up and do a fresh docker install. Stay tuned.

@sooftware thx for sharing :))

@SeunghyunSEO You are welcome!! :)) lol!!


i feel like the way you hustle is rly inspiring me @sooftware , Lets keep that hot 👍

@SeunghyunSEO 👍👍👍 I really want to see you soon. 😄😄

@loretoparisi Thank you so much! Really really appreciate it.

Hey guys. I have successfully installed fairseq and run recognize.py on my own batch of .wav files.
However, the loader only seems to work for the Base pretrained model: "wav2vec_small_960h.pt".
When I try to load the large self-supervised model ("wav2vec_vox_960h_pl.pt"), it returns this error:

Traceback (most recent call last):
  File "examples/wav2vec/recognize.py", line 198, in <module>
    main()
  File "examples/wav2vec/recognize.py", line 173, in main
    model = load_model(args.w2v_path, target_dict)
  File "examples/wav2vec/recognize.py", line 159, in load_model
    model = Wav2VecCtc.build_model(w2v["args"], target_dict)
  File "examples/wav2vec/recognize.py", line 40, in build_model
    w2v_encoder = Wav2VecEncoder(args, target_dict)
  File "/mnt/Data/Fairseq/fairseq/fairseq/models/wav2vec/wav2vec2_asr.py", line 330, in __init__
    args.w2v_path, arg_overrides
  File "/mnt/Data/Fairseq/fairseq/fairseq/checkpoint_utils.py", line 227, in load_checkpoint_to_cpu
    with open(PathManager.get_local_path(path), "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: '/private/home/abaevski/models/wav2vec2/wav2vec_vox_new.pt'

After debugging for a little bit, I can see that for these cases, separate load methods are called in serialization.py.
More specifically:
"wav2vec_small_960h.pt" will load with _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args) -> line 595 in serialization.py
"wav2vec_vox_960h_pl.pt" will load with _load(opened_zipfile, map_location, pickle_module, **pickle_load_args) -> line 594 in serialization.py

@sooftware: Is this planned to be fixed? It seems like the newly archived model checkpoints don't work with the current version of recognize.py and the way it loads everything :(

Also, my running process does not show up in nvidia-smi, although torch.cuda.is_available() == True, so why is it not using the GPU?

@PetreanuAndi That's a very interesting note on how the different models are loaded. @loretoparisi I wonder if this might be the source of my error, although I am not getting the same error that @PetreanuAndi is getting. The model I'm loading is wav2vec2_vox_960h.pt, while it seems @PetreanuAndi is loading wav2vec 1.0, though I could be wrong.

@PetreanuAndi try the temporary solution in #2803: copy the params from a working model like wav2vec_small_960h.pt.

@loretoparisi I just tried with the model you tried, wav2vec_small_10m.pt, and it still gives me the same warnings, but it actually didn't say "Killed" this time; it did provide a transcript (after the warnings), though with a very bad error rate unfortunately. This is what it transcribed:

THE QUESSTION OS HE SENCTOURLY THS HOW IS IT THAT THER IS A TENANCY WITHEN MEMBERS OF OARSIVOLISACTION NAT DD YO

And this is the reference:

The question is essentially this: how is it that there is a tendency within members of our civilization not to use

I wonder why the larger model isn't exporting anything.

Edit: The file I just tried was only 8 seconds long. I now just tried a 9-minute file, and once again (with the small model) it says "Killed". When I try the 8-second file with the large model it also says "Killed".

@loretoparisi Now I'm wondering if this is a memory allocation problem. When I checked the Docker GUI I found that 2 GB was assigned for RAM, I believe. I tried increasing this to 6 GB, but in order to apply the change I had to restart Docker.

EDIT: Never mind! I believe I've solved this by simply changing the directory. No need to read further. Now I'm going to test if this really is a memory problem or not.


The trouble now is that I'm able to run the wav2vec2 container, but since I know nothing about Docker, I have no idea how to link the external data directory back to this wav2vec2 container. Is there some command to do that?

I believe this command:

docker run -d -it --rm -v $PWD/data:/root/data --name w2v2 wav2vec2

Has something to do with it but when I run that in the same folder as before I get this error:

unable to get absolute bin path: stat .: no such file or directory

Would the poor result I got maybe have something to do with a language model? Or something else, I wonder?

@loretoparisi 😮 It seems that I might have been onto something with that memory thing. After expanding to 6 GB, I did get a transcription from the large model! (still getting those warnings though)

THE QUESTION IS ESSENTIALLY THIS HOW IS IT THAT THERE IS A TENDENCY WITHIN MEMBERS OF OUR CIVILIZATION NOT TO UE

Now the question is... how to get this working with larger files, not only 8-second ones.

Note for people just trying this model: I seem to get awful results with 44 kHz audio (for example it outputs "HHA HA AHAH HA", which is not in the file); it seems to perform well on 16 kHz. I'm not sure what sample rate the models can go up to.

Okay I just tried with the large model, on a 30 second file, and it did transcribe but again I'm getting these unusual misspellings... I wonder why that is. Anybody have a clue?

ANDI'M GOING TO TRY TO SHOW YOU WHAT KIND OF PROCESS IS INVOLVED YO WELT IF I AM ABLE TO DO THAT TO NIGHT TOGT YOU SHOULD BE ABLE TO GET IT ARE ENOUGH INFORMANTION HERE TO BE ABLE TO ANSWER THOSE QUESTIONS JUST HOW IT IS THAT REREVERT AWAY FROM THEI USEFUL TOLES

Anybody know how to pick/switch the language model temporarily?


Just reading over this thread more thoroughly, so impressed and grateful for your contributions as well @mychiux413 !

I plan on trying your commands here as well and I'm wondering what you mean by this specifically:

The kenlm model is: 4-gram ARPA -> to speed things up, you should build the .bin yourself

Is there some conversion process from .arpa to .bin or some way to build? I downloaded the file but I'm not exactly sure how to go about making it into a .bin.

@mychiux413 Okay, I think I figured it out (Sorry to bother you!). (Although I haven't gotten the model to do anything yet so I can't be sure)

For anyone here who is curious how to do this (conversion from ARPA to .bin):

  1. Go to this page: https://kheafield.com/code/kenlm/ - see this link: https://kheafield.com/code/kenlm.tar.gz - Download it.

  2. Extract the zipped folder

  3. You'll see a kenlm folder. Follow the instructions on the page for how to 'build' kenlm. The first instruction is mkdir kenlm/build; follow along until it completes and says 'SUCCESS'

  4. Go to this page: https://kheafield.com/code/kenlm/structures/

  5. Inside the kenlm folder, you'll find a folder called 'build' which you just created. Place the .arpa file you downloaded there

  6. cd into this build folder. Follow the instructions in this page: https://kheafield.com/code/kenlm/structures/ for creating a binary file. You have 2 options to go for trie or probing. trie is for saving memory, probing is for speed. My most recommended approach is first to run this command: bin/build_binary 4-gram.arpa (assuming your filename is called 4-gram.arpa), to check the estimates of how much memory each method will use. If you have enough memory, go for probing, if not go for trie. For now, I did both because I'm not sure which one will give me better results. First I did bin/build_binary trie 4-gram.arpa 4-gram_trie.binary then I did bin/build_binary probing 4-gram.arpa 4-gram_probing.binary

  7. You'll get a file called 4-gram.binary (or 4-gram_trie.binary / 4-gram_probing.binary, depending on the command) - I renamed mine to 4-gram.bin

  8. Move the 4-gram.bin file to the data folder in the Docker container.
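
Before wiring the binary into infer.py, a quick sanity check can save time. Below is a minimal sketch, assuming the optional kenlm python package (from https://github.com/kpu/kenlm) is installed; fairseq itself does not need it, and the file name matches step 7 above.

import kenlm

# Load the binary produced by build_binary (probing or trie both work here).
lm = kenlm.Model("4-gram.bin")

# A 4-gram model should report order 4; an exception here usually means the
# binary was built from the wrong file or with an incompatible kenlm build.
print("order:", lm.order)
print("log10 prob:", lm.score("THIS IS A TEST", bos=True, eos=True))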

@mychiux413 Now what I'm wondering is how to run the example you gave not on librispeech but on my own wav files.

@mychiux413 Okay I finally got it to test by providing it with the test.ltr and test.wrd and test.tsv files.

However, is it possible to get an inference using the kenlm model (or other models) without requiring reference files? In other words not testing but just doing inference? Is there something I need to change in the command?

Okay, with regards to that (doing inference without reference files) with the command:

# In Docker environment...
cd ~/fairseq
python examples/speech_recognition/infer.py ~/data/libri --task audio_pretraining \
--nbest 1 --path ~/data/wav2vec2_vox_960h.pt --gen-subset test --results-path ~/data/result-kenlm --w2l-decoder kenlm \
--lm-model ~/data/4-gram.bin --lm-weight 2 --word-score -1 --sil-weight 0 --criterion ctc --labels ltr --max-tokens 4000000 \
--post-process letter --cpu --num-workers 1 --batch-size 8 --lexicon ~/data/librispeech_lexicon.lst --beam 1024

I think I got an idea, which I'll share for those curious: simply generate a tsv file with the segmented audio files (of a larger audio file) in the left column and their frame counts in the right column (making sure to put the directory of the audio files on the very first line of the tsv file), and copy the same dummy transcript down for every line in the test.wrd and test.ltr files. The transcript doesn't have to be correct because we're not actually testing, just getting the inference. Obviously this will result in a very poor WER, but that can be measured later on if you have your full reference and a hypothesis (see the sketch after the examples below).

The transcript could be anything, e.g. for test.ltr:

T H I S | I S

For test.wrd:

THIS IS

And you can just copy that down like so for every file you'll run (first the test.ltr lines, then the test.wrd lines):

T H I S | I S
T H I S | I S
T H I S | I S
T H I S | I S
THIS IS
THIS IS
THIS IS
THIS IS
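
Here is a minimal sketch of that idea, assuming a folder of already-segmented 16 kHz mono wav files; the paths and the dummy transcript are placeholders, and the output mirrors the test.tsv / test.ltr / test.wrd format shown earlier.

import os
import soundfile as sf

audio_dir = "/root/data/my_wavs"   # hypothetical folder with segmented 16 kHz mono wav files
out_dir = "/root/data/libri"       # folder that infer.py is pointed at
dummy = "THIS IS"                  # placeholder transcript, only there to satisfy the data loader

wav_files = sorted(f for f in os.listdir(audio_dir) if f.endswith(".wav"))

with open(os.path.join(out_dir, "test.tsv"), "w") as tsv, \
        open(os.path.join(out_dir, "test.ltr"), "w") as ltr, \
        open(os.path.join(out_dir, "test.wrd"), "w") as wrd:
    print(audio_dir, file=tsv)     # first line of the tsv is the audio root directory
    for name in wav_files:
        n_frames = sf.info(os.path.join(audio_dir, name)).frames
        print(f"{name}\t{n_frames}", file=tsv)                     # relative path <tab> frame count
        print(" ".join(dummy.replace(" ", "|")) + " |", file=ltr)  # "T H I S | I S |"
        print(dummy, file=wrd)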

Another note: When I try to change the sample rate to 32kHz (by setting --sample-rate to 32000 in the command given by @mychiux413), and give it a 32kHz file, I get this error:

Traceback (most recent call last):
  File "examples/speech_recognition/infer.py", line 471, in <module>
    cli_main()
  File "examples/speech_recognition/infer.py", line 467, in cli_main
    main(args)
  File "examples/speech_recognition/infer.py", line 362, in main
    for sample in t:
  File "/usr/local/lib/python3.6/dist-packages/tqdm/std.py", line 1119, in __iter__
    for obj in iterable:
  File "/root/fairseq/fairseq/data/iterators.py", line 59, in __iter__
    for x in self.iterable:
  File "/root/fairseq/fairseq/data/iterators.py", line 583, in __next__
    raise item
  File "/root/fairseq/fairseq/data/iterators.py", line 514, in run
    for item in self._source:
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 435, in __next__
    data = self._next_data()
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1085, in _next_data
    return self._process_data(data)
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1111, in _process_data
    data.reraise()
  File "/usr/local/lib/python3.6/dist-packages/torch/_utils.py", line 428, in reraise
    raise self.exc_type(msg)
Exception: Caught Exception in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/root/fairseq/fairseq/data/add_target_dataset.py", line 38, in __getitem__
    item = self.dataset[index]
  File "/root/fairseq/fairseq/data/audio/raw_audio_dataset.py", line 177, in __getitem__
    feats = self.postprocess(feats, curr_sample_rate)
  File "/root/fairseq/fairseq/data/audio/raw_audio_dataset.py", line 56, in postprocess
    raise Exception(f"sample rate: {curr_sample_rate}, need {self.sample_rate}")
Exception: sample rate: 32000, need 16000

So I'm not sure if this is a limitation of the model or a mistake on my end.

The frequency of your dataset is 32k, not 16k. Just resample it to 16k, and don't forget to make it 1 channel (mono) too. There are many modules you can use to do that (search Google or Stack Overflow), for example the pydub module.
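
For example, a minimal pydub sketch (paths are placeholders; pydub needs ffmpeg installed) that resamples a file to the 16 kHz mono that the released checkpoints expect:

from pydub import AudioSegment

# Read any format ffmpeg understands, then force a 16 kHz sample rate and a single channel.
audio = AudioSegment.from_file("input_32k_stereo.wav")
audio = audio.set_frame_rate(16000).set_channels(1)
audio.export("output_16k_mono.wav", format="wav")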

@wahyubram82 oh sorry, I may not have clarified that I've tested with 16kHz mono and it works, but I'd like to give the model a 32kHz dataset because I want to test how well it transcribes at that frequency, so I tried to set the sample-rate parameter to 32kHz. Is the model not able to transcribe 32kHz, or anything that's not 16kHz?

Is there anybody implementing wav2vec 2.0 with a transformer seq2seq decoder?

Hey guys. I have successfully installed fairseq and ran recognize.py on my own batch of .wav files.
However, the loader only seems to work for the base pretrained model: "wav2vec_small_960h.pt".
When I try to load the large self-supervised model ("wav2vec_vox_960h_pl.pt"), it returns this error:

Traceback (most recent call last):
  File "examples/wav2vec/recognize.py", line 198, in <module>
    main()
  File "examples/wav2vec/recognize.py", line 173, in main
    model = load_model(args.w2v_path, target_dict)
  File "examples/wav2vec/recognize.py", line 159, in load_model
    model = Wav2VecCtc.build_model(w2v["args"], target_dict)
  File "examples/wav2vec/recognize.py", line 40, in build_model
    w2v_encoder = Wav2VecEncoder(args, target_dict)
  File "/mnt/Data/Fairseq/fairseq/fairseq/models/wav2vec/wav2vec2_asr.py", line 330, in __init__
    args.w2v_path, arg_overrides
  File "/mnt/Data/Fairseq/fairseq/fairseq/checkpoint_utils.py", line 227, in load_checkpoint_to_cpu
    with open(PathManager.get_local_path(path), "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: '/private/home/abaevski/models/wav2vec2/wav2vec_vox_new.pt'

After debugging for a little bit, I can see that separate load methods are called in serialization.py for these two cases.
More specifically:
"wav2vec_small_960h.pt" loads via _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args) -> line 595 in serialization.py
"wav2vec_vox_960h_pl.pt" loads via _load(opened_zipfile, map_location, pickle_module, **pickle_load_args) -> line 594 in serialization.py

@sooftware : Is this planned to be fixed? It seems like the new archived model-checkpoints don't work with the current version of recognize.py and the way it loads everything :(

Also, my running process does not show up in nvidia-smi, although torch.cuda.is_available() == True, so why is it not using the GPU?

I encountered the same problem when running with wav2vec_vox_960h_pl.pt

Hi @sooftware, thanks for recognize.py, it's a great script you made there. However, I have some issues. Your script works fine with the released fine-tuned wav2vec 2.0 models, but when I try to use my own fine-tuned model it throws an error.

Code I ran -->

python3 examples/wav2vec/recognize.py --wav_path /path/audio_file/10min/file/audio_finetune1/file2375.wav --w2v_path /path/audio_file/wav2vec_small_10m.pt --target_dict_path /path/audio_file/manifest/dict.ltr.txt

Error I'm getting-->

Traceback (most recent call last):
  File "examples/wav2vec/recognize.py", line 192, in <module>
    main()
  File "examples/wav2vec/recognize.py", line 173, in main
    model = load_model(args.w2v_path, target_dict)
  File "examples/wav2vec/recognize.py", line 159, in load_model
    model = Wav2VecCtc.build_model(w2v["args"], target_dict)
  File "examples/wav2vec/recognize.py", line 39, in build_model
    base_architecture(args)
  File "/path/fairseq/fairseq/models/wav2vec/wav2vec2_asr.py", line 633, in base_architecture
    args.no_pretrained_weights = getattr(args, "no_pretrained_weights", False)
AttributeError: 'NoneType' object has no attribute 'no_pretrained_weights'

Could you please look into it, in case I'm making some mistake on my side?

(quoting the earlier report above: recognize.py fails to load wav2vec_vox_960h_pl.pt with FileNotFoundError for '/private/home/abaevski/models/wav2vec2/wav2vec_vox_new.pt', and the follow-up "I encountered the same problem")

That's my bad, those models were trained on a version of the code that didn't correctly populate "w2v_args". I've uploaded fixed model checkpoints for those pseudo-labeling models. Please let me know if it helps!

@alexeib Re-downloaded wav2vec_vox_960h_pl.pt and it runs successfully on commit 18d3b5c,
but when I run it on the latest code (6815772) an error occurs: issubclass() arg 1 must be a class

@alexeib And now the error changed to: Error merging override model.latent_temp='(2.0,0.1,0.999995)' on e607911

@FrogView I have the same error: hydra.errors.ConfigCompositionException: Error merging override model.latent_temp='(2.0,0.5,0.999995)' on SHA 265791b.

Using b58f4f0 causes a segmentation fault.
Using 18d3b5c gives NameError: name 'CriterionType' is not defined even though I'm specifying --criterion ctc

Hello :)
For those who get the error "hydra.errors.ConfigCompositionException: Error merging override model.latent_temp='(2.0,0.5,0.999995)'", the problem is the definition of latent_temp in the wav2vec2 config. It used to be a string that was eval'd at execution time, and the pretrained model checkpoints were created with that old definition, before fairseq switched to Hydra.
Now latent_temp has a proper (tuple) type and Hydra checks it. But we can't keep backward compatibility by allowing both types (the correct type and the string), since omegaconf, a backend of Hydra, doesn't handle Union types.
I have a fix on a fork of fairseq but can't PR it because it would break the checkpoints created after the configuration refactoring: AdrianVandierAst@eb080b0
If anyone has an idea how to patch fairseq correctly, I'd be glad to implement it.
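
In case it helps anyone who is blocked on this, below is only a sketch of one possible workaround, under the assumption that the failing checkpoint stores latent_temp as the old string form: it rewrites the value into a real tuple and saves a patched copy. This is not an official fix, the checkpoint layout (args vs. cfg) differs between releases, and the paths are placeholders.

import ast
import torch

ckpt_in = "wav2vec_checkpoint.pt"            # placeholder: the checkpoint that fails to load
ckpt_out = "wav2vec_checkpoint_patched.pt"   # placeholder output name

state = torch.load(ckpt_in, map_location="cpu")

def fix_latent_temp(ns):
    # Turn a stringly-typed latent_temp like "(2.0,0.5,0.999995)" into a tuple of floats.
    value = getattr(ns, "latent_temp", None)
    if isinstance(value, str):
        ns.latent_temp = ast.literal_eval(value)

args = state.get("args")
if args is not None:
    fix_latent_temp(args)
    # Fine-tuned checkpoints may nest the pretraining args under w2v_args.
    w2v_args = getattr(args, "w2v_args", None)
    if w2v_args is not None:
        fix_latent_temp(w2v_args)

torch.save(state, ckpt_out)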

Can someone provide a command to repro this using a published checkpoint? There was some discussion about this latent_temp issue in another thread, but I couldn't repro these crashes with the most recent fairseq code and the published checkpoints.

@alexeib I still have the same error on the most recent commit. I built kenlm using the tarball (not via git), compiled with -DKENLM_MAX_ORDER=20.

The following command:

python fairseq/examples/speech_recognition/infer.py ./data/manifest --path ./models/w2v2/wav2vec_small_960h.pt --results-path ./results/ --lexicon ./models/w2v2/librispeech_lexicon.lst --w2l-decoder kenlm --lm-model ./models/kenlm/4-gram.bin --task audio_pretraining --nbest 1 --gen-subset dev_other --lm-weight 2 --word-score -1 --sil-weight 0 --criterion ctc --labels ltr --max-tokens 400000 --post-process letter --cpu --num-workers 1 --batch-size 8 --beam 1024

causes the stack trace

INFO:__main__:| decoding with criterion ctc
INFO:__main__:| loading model(s) from ./models/w2v2/wav2vec_small_960h.pt
INFO:fairseq.data.audio.raw_audio_dataset:loaded 1, skipped 0 samples
INFO:__main__:| ./data/manifest dev_other 1 examples
/home/prad/github/wrapASR/fairseq/examples/speech_recognition/w2l_decoder.py:42: UserWarning: wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings
  "wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings"
Traceback (most recent call last):
  File "fairseq/examples/speech_recognition/infer.py", line 428, in <module>
    cli_main()
  File "fairseq/examples/speech_recognition/infer.py", line 424, in cli_main
    main(args)
  File "fairseq/examples/speech_recognition/infer.py", line 284, in main
    generator = build_generator(args)
  File "fairseq/examples/speech_recognition/infer.py", line 273, in build_generator
    return W2lKenLMDecoder(args, task.target_dictionary)
  File "/home/prad/github/wrapASR/fairseq/examples/speech_recognition/w2l_decoder.py", line 133, in __init__
    super().__init__(args, tgt_dict)
  File "/home/prad/github/wrapASR/fairseq/examples/speech_recognition/w2l_decoder.py", line 56, in __init__
    self.criterion_type = CriterionType.CTC

Looks like it can't import wav2letter. Have you tried installing the python bindings like the error message suggests?
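
A quick way to confirm the bindings are importable from the same environment used for infer.py (just a sketch; these are the modules the kenlm decoder tries to import):

# Run this in the python environment you use for infer.py.
try:
    from wav2letter.criterion import CpuViterbiPath  # noqa: F401
    from wav2letter.decoder import CriterionType     # noqa: F401
    print("wav2letter python bindings are importable")
except ImportError as err:
    print("wav2letter bindings are missing or broken:", err)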

Ah thanks! I don't know how I missed that, must be an issue with my wav2letter install

Returning to this after a while, I just ran my first test and I'm getting surprisingly poor results. I believe my audio file was 16kHz, a lecture with some noise in it. My WER is 22.9% and my CER is 14.75%.

I used 4-gram (probing), and here is the command I ran:

python examples/speech_recognition/infer.py ~/data/libri --task audio_pretraining --nbest 1 --path ~/data/wav2vec2_vox_960h.pt --gen-subset test --results-path ~/data/result-kenlm --w2l-decoder kenlm --lm-model ~/data/4-gram_probing.bin --lm-weight 2 --word-score -1 --sil-weight 0 --criterion ctc --labels ltr --max-tokens 4000000 --post-process letter --cpu --num-workers 1 --batch-size 8 --lexicon ~/data/librispeech_lexicon.lst --beam 1024

Anybody know how to get this down to at the very least 4-5%?

Do I have to use the transformer language model instead?

I am getting this warning:

/root/fairseq/examples/speech_recognition/w2l_decoder.py:41: UserWarning: wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings
  "wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings"

However, it seems to run either way. Could the word error rate be high due to this?

You need to use the fairseq model (.pt), not the wav2letter model (.bin).

@alexeib ah! 🤦‍♂️ Sorry about that, you’re right. I’ll try again soon with the actual language model haha.

Okay, so I downloaded and tried to run it, here's what's going on so far:

I tried running this command:

python examples/speech_recognition/infer.py ~/data/libri --task audio_pretraining --nbest 1 --path ~/data/wav2vec2_vox_960h.pt --gen-subset test --results-path ~/data/result-kenlm --w2l-decoder fairseqlm --lm-model ~/data/lm_librispeech_word_transformer.pt --lm-weight 2 --word-score -1 --sil-weight 0 --criterion ctc --labels ltr --max-tokens 4000000 --post-process letter --cpu --num-workers 1 --batch-size 8 --lexicon ~/data/librispeech_lexicon.lst --beam 500

But it gave me this error:

Traceback (most recent call last):
  File "examples/speech_recognition/infer.py", line 471, in <module>
    cli_main()
  File "examples/speech_recognition/infer.py", line 467, in cli_main
    main(args)
  File "examples/speech_recognition/infer.py", line 327, in main
    generator = build_generator(args)
  File "examples/speech_recognition/infer.py", line 320, in build_generator
    return W2lFairseqLMDecoder(args, task.target_dictionary)
  File "/root/fairseq/examples/speech_recognition/w2l_decoder.py", line 354, in __init__
    task = tasks.setup_task(lm_args)
  File "/root/fairseq/fairseq/tasks/__init__.py", line 26, in setup_task
    return TASK_REGISTRY[cfg.task].setup_task(cfg, **kwargs)
  File "/root/fairseq/fairseq/tasks/language_modeling.py", line 158, in setup_task
    dictionary, output_dictionary = cls.setup_dictionary(args, **kwargs)
  File "/root/fairseq/fairseq/tasks/language_modeling.py", line 142, in setup_dictionary
    dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
  File "/root/fairseq/fairseq/data/dictionary.py", line 214, in load
    d.add_from_file(f)
  File "/root/fairseq/fairseq/data/dictionary.py", line 227, in add_from_file
    raise fnfe
  File "/root/fairseq/fairseq/data/dictionary.py", line 224, in add_from_file
    with open(PathManager.get_local_path(f), "r", encoding="utf-8") as fd:
FileNotFoundError: [Errno 2] No such file or directory: '/root/data/dict.txt'

Then I tried taking the dict.ltr.txt file from the libri folder and putting it in the data folder, and renaming it to "dict.txt", and I ran this:

python examples/speech_recognition/infer.py ~/data/libri --task audio_pretraining --nbest 1 --path ~/data/wav2vec2_vox_960h.pt --gen-subset test --results-path ~/data/result-kenlm --w2l-decoder fairseqlm --lm-model ~/data/lm_librispeech_word_transformer.pt --lm-weight 2 --word-score -1 --sil-weight 0 --criterion ctc --labels ltr --max-tokens 4000000 --post-process letter --cpu --num-workers 1 --batch-size 8 --lexicon ~/data/librispeech_lexicon.lst --beam 500

I got this error:

INFO:fairseq.tasks.language_modeling:dictionary: 32 types
Traceback (most recent call last):
  File "examples/speech_recognition/infer.py", line 471, in <module>
    cli_main()
  File "examples/speech_recognition/infer.py", line 467, in cli_main
    main(args)
  File "examples/speech_recognition/infer.py", line 327, in main
    generator = build_generator(args)
  File "examples/speech_recognition/infer.py", line 320, in build_generator
    return W2lFairseqLMDecoder(args, task.target_dictionary)
  File "/root/fairseq/examples/speech_recognition/w2l_decoder.py", line 355, in __init__
    model = task.build_model(lm_args)
  File "/root/fairseq/fairseq/tasks/language_modeling.py", line 178, in build_model
    model = super().build_model(args)
  File "/root/fairseq/fairseq/tasks/fairseq_task.py", line 548, in build_model
    model = models.build_model(args, self)
  File "/root/fairseq/fairseq/models/__init__.py", line 56, in build_model
    return ARCH_MODEL_REGISTRY[cfg.arch].build_model(cfg, task)
  File "/root/fairseq/fairseq/models/transformer_lm.py", line 221, in build_model
    args.quant_noise_pq_block_size,
  File "/root/fairseq/fairseq/modules/adaptive_input.py", line 33, in __init__
    ), "cannot specify cutoff larger than vocab size"

Then I tried downloading the fairseq dict file listed next to the transformer model here: https://github.com/facebookresearch/wav2letter/tree/master/recipes/sota/2019

I then renamed the file to 'dict.txt'

I ran this command:

python examples/speech_recognition/infer.py ~/data/libri --task audio_pretraining --nbest 1 --path ~/data/wav2vec2_vox_960h.pt --gen-subset test --results-path ~/data/result-kenlm --w2l-decoder fairseqlm --lm-model ~/data/lm_librispeech_word_transformer.pt --lm-weight 2 --word-score -1 --sil-weight 0 --criterion ctc --labels ltr --max-tokens 4000000 --post-process letter --cpu --num-workers 1 --batch-size 8 --lexicon ~/data/librispeech_lexicon.lst --beam 500

And I got this error:

INFO:fairseq.tasks.language_modeling:dictionary: 221456 types
Traceback (most recent call last):
  File "examples/speech_recognition/infer.py", line 471, in <module>
    cli_main()
  File "examples/speech_recognition/infer.py", line 467, in cli_main
    main(args)
  File "examples/speech_recognition/infer.py", line 327, in main
    generator = build_generator(args)
  File "examples/speech_recognition/infer.py", line 320, in build_generator
    return W2lFairseqLMDecoder(args, task.target_dictionary)
  File "/root/fairseq/examples/speech_recognition/w2l_decoder.py", line 362, in __init__
    self.lm = FairseqLM(self.word_dict, model)
  File "/root/fairseq/examples/speech_recognition/w2l_decoder.py", line 223, in __init__
    model.cuda()
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 463, in cuda
    return self._apply(lambda t: t.cuda(device))
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 359, in _apply
    module._apply(fn)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 359, in _apply
    module._apply(fn)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 359, in _apply
    module._apply(fn)
  [Previous line repeated 2 more times]
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 381, in _apply
    param_applied = fn(param)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 463, in <lambda>
    return self._apply(lambda t: t.cuda(device))
  File "/usr/local/lib/python3.6/dist-packages/torch/cuda/__init__.py", line 172, in _lazy_init
    torch._C._cuda_init()
RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

If this error in fact follows from a proper command/sequence of events, and not from some mistake I made in one of the inputs:

Is there some way to run this on CPU? I already have the --cpu flag in that command.

I believe I have a CUDA driver installed, but I'm not sure whether I have an NVIDIA one; it seems from my settings that I do, but for some reason it's not being detected?

(screenshot attached: Screen Shot 2020-12-05 at 3:27:21 PM)

There's a whole bunch of problems in this area with Apple/Mac/Nvidia that are hairy to get into. I'd rather just run on CPU.

I want to use wav2vec 2.0 as a featurizer, i.e. get just the embeddings. Can anyone help with this or point to a starting point?

@sooftware have you added Language Model decoding in this inference pipeline?