I'm trying to load the pretrained Wav2Vec 2.0 model (I downloaded the Wav2Vec 2.0 Base - 960 hours model from here). When I try to load it via model.load_state_dict, I get the error below about missing and unexpected keys. Any ideas on how to load this model?
import torch
from fairseq.models.wav2vec import Wav2Vec2Model
cp = torch.load('pretrained_models/wav2vec_small_960h.pt', map_location=torch.device('cpu'))
model = Wav2Vec2Model.build_model(cp['args'], task=None)
model.load_state_dict(cp['model'])  # also fails with strict=False
Exception has occurred: RuntimeError
Error(s) in loading state_dict for Wav2Vec2Model:
Missing key(s) in state_dict: "mask_emb", "feature_extractor.conv_layers.0.0.weight", "feature_extractor.conv_layers.0.2.weight", "feature_extractor.conv_layers.0.2.bias", "feature_extractor.conv_layers.1.0.weight", "feature_extractor.conv_layers.2.0.weight", "feature_extractor.conv_layers.3.0.weight", "feature_extractor.conv_layers.4.0.weight", "feature_extractor.conv_layers.5.0.weight", "project_q.weight", "project_q.bias", "encoder.pos_conv.0.bias", "encoder.pos_conv.0.weight_g", "encoder.pos_conv.0.weight_v", "encoder.layers.0.self_attn.k_proj.weight", "encoder.layers.0.self_attn.k_proj.bias", "encoder.layers.0.self_attn.v_proj.weight", "encoder.layers.0.self_attn.v_proj.bias", "encoder.layers.0.self_attn.q_proj.weight", "encoder.layers.0.self_attn.q_proj.bias", "encoder.layers.0.self_attn.out_proj.weight", "encoder.layers.0.self_attn.out_proj.bias", "encoder.layers.0.self_attn_layer_norm.weight", "encoder.layers.0.self_attn_layer_norm.bias", "encoder.layers.0.fc1.weight", "encoder.layers.0.fc1.bias", "encoder.layers.0.fc2.weight", "encoder.layers.0.fc2.bias", "encoder.layers.0.final_layer_norm.weight", "encoder.layers.0.final_layer_norm.bias", "encoder.layers.1.self_attn.k_proj.weight", "encoder.layers.1.self_attn.k_proj.bias", "encoder.layers.1.self_attn.v_proj.weight", "encoder.layers.1.self_attn.v_proj.bias", "encoder.layers.1.self_attn.q_proj.weight", "encoder.layers.1.self_attn.q_proj.bias", "encoder.layers.1.self_attn.out_proj.weight", "encoder.layers.1.self_attn.out_proj.bias", "encoder.layers.1.self_attn_layer_norm.weight", "encoder.layers.1.self_attn_layer_norm.bias", "encoder.layers.1.fc1.weight", "encoder.layers.1.fc1.bias", "encoder.layers.1.fc2.weight", "encoder.layers.1.fc2.bias", "encoder.layers.1.final_layer_norm.weight", "encoder.layers.1.final_layer_norm.bias", "encoder.layers.2.self_attn.k_proj.weight", "encoder.layers.2.self_attn.k_proj.bias", "encoder.layers.2.self_attn.v_proj.weight", "encoder.layers.2.self_attn.v_proj.bias", "encoder.layers.2.self_attn.q_proj.weight", "encoder.layers.2.self_attn.q_proj.bias", "encoder.layers.2.self_attn.out_proj.weight", "encoder.layers.2.self_attn.out_proj.bias", "encoder.layers.2.self_attn_layer_norm.weight", "encoder.layers.2.self_attn_layer_norm.bias", "encoder.layers.2.fc1.weight", "encoder.layers.2.fc1.bias", "encoder.layers.2.fc2.weight", "encoder.layers.2.fc2.bias", "encoder.layers.2.final_layer_norm.weight", "encoder.layers.2.final_layer_norm.bias", "encoder.layers.3.self_attn.k_proj.weight", "encoder.layers.3.self_attn.k_proj.bias", "encoder.layers.3.self_attn.v_proj.weight", "encoder.layers.3.self_attn.v_proj.bias", "encoder.layers.3.self_attn.q_proj.weight", "encoder.layers.3.self_attn.q_proj.bias", "encoder.layers.3.self_attn.out_proj.weight", "encoder.layers.3.self_attn.out_proj.bias", "encoder.layers.3.self_attn_layer_norm.weight", "encoder.layers.3.self_attn_layer_norm.bias", "encoder.layers.3.fc1.weight", "encoder.layers.3.fc1.bias", "encoder.layers.3.fc2.weight", "encoder.layers.3.fc2.bias", "encoder.layers.3.final_layer_norm.weight", "encoder.layers.3.final_layer_norm.bias", "encoder.layers.4.self_attn.k_proj.weight", "encoder.layers.4.self_attn.k_proj.bias", "encoder.layers.4.self_attn.v_proj.weight", "encoder.layers.4.self_attn.v_proj.bias", "encoder.layers.4.self_attn.q_proj.weight", "encoder.layers.4.self_attn.q_proj.bias", "encoder.layers.4.self_attn.out_proj.weight", "encoder.layers.4.self_attn.out_proj.bias", "encoder.layers.4.self_attn_layer_norm.weight", "encoder.layers.4.self_attn_layer_norm.bias", 
"encoder.layers.4.fc1.weight", "encoder.layers.4.fc1.bias", "encoder.layers.4.fc2.weight", "encoder.layers.4.fc2.bias", "encoder.layers.4.final_layer_norm.weight", "encoder.layers.4.final_layer_norm.bias", "encoder.layers.5.self_attn.k_proj.weight", "encoder.layers.5.self_attn.k_proj.bias", "encoder.layers.5.self_attn.v_proj.weight", "encoder.layers.5.self_attn.v_proj.bias", "encoder.layers.5.self_attn.q_proj.weight", "encoder.layers.5.self_attn.q_proj.bias", "encoder.layers.5.self_attn.out_proj.weight", "encoder.layers.5.self_attn.out_proj.bias", "encoder.layers.5.self_attn_layer_norm.weight", "encoder.layers.5.self_attn_layer_norm.bias", "encoder.layers.5.fc1.weight", "encoder.layers.5.fc1.bias", "encoder.layers.5.fc2.weight", "encoder.layers.5.fc2.bias", "encoder.layers.5.final_layer_norm.weight", "encoder.layers.5.final_layer_norm.bias", "encoder.layer_norm.weight", "encoder.layer_norm.bias", "layer_norm.weight", "layer_norm.bias", "final_proj.weight", "final_proj.bias".
Unexpected key(s) in state_dict: "w2v_encoder.proj.weight", "w2v_encoder.proj.bias", "w2v_encoder.w2v_model.feature_extractor.conv_layers.0.0.weight", "w2v_encoder.w2v_model.feature_extractor.conv_layers.0.2.weight", "w2v_encoder.w2v_model.feature_extractor.conv_layers.0.2.bias", "w2v_encoder.w2v_model.feature_extractor.conv_layers.1.0.weight", "w2v_encoder.w2v_model.feature_extractor.conv_layers.2.0.weight", "w2v_encoder.w2v_model.feature_extractor.conv_layers.3.0.weight", "w2v_encoder.w2v_model.feature_extractor.conv_layers.4.0.weight", "w2v_encoder.w2v_model.feature_extractor.conv_layers.5.0.weight", "w2v_encoder.w2v_model.feature_extractor.conv_layers.6.0.weight", "w2v_encoder.w2v_model.encoder.pos_conv.0.bias", "w2v_encoder.w2v_model.encoder.pos_conv.0.weight_g", "w2v_encoder.w2v_model.encoder.pos_conv.0.weight_v", "w2v_encoder.w2v_model.encoder.layers.0.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.0.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.0.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.0.self_attn.v_proj.bias", "w2v_encoder.w2v_model.encoder.layers.0.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.0.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.0.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.0.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.0.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.0.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.0.fc1.weight", "w2v_encoder.w2v_model.encoder.layers.0.fc1.bias", "w2v_encoder.w2v_model.encoder.layers.0.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.0.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.0.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.0.final_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.1.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.1.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.1.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.1.self_attn.v_proj.bias", "w2v_encoder.w2v_model.encoder.layers.1.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.1.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.1.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.1.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.1.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.1.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.1.fc1.weight", "w2v_encoder.w2v_model.encoder.layers.1.fc1.bias", "w2v_encoder.w2v_model.encoder.layers.1.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.1.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.1.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.1.final_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.2.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.2.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.2.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.2.self_attn.v_proj.bias", "w2v_encoder.w2v_model.encoder.layers.2.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.2.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.2.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.2.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.2.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.2.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.2.fc1.weight", 
"w2v_encoder.w2v_model.encoder.layers.2.fc1.bias", "w2v_encoder.w2v_model.encoder.layers.2.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.2.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.2.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.2.final_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.3.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.3.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.3.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.3.self_attn.v_proj.bias", "w2v_encoder.w2v_model.encoder.layers.3.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.3.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.3.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.3.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.3.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.3.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.3.fc1.weight", "w2v_encoder.w2v_model.encoder.layers.3.fc1.bias", "w2v_encoder.w2v_model.encoder.layers.3.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.3.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.3.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.3.final_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.4.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.4.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.4.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.4.self_attn.v_proj.bias", "w2v_encoder.w2v_model.encoder.layers.4.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.4.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.4.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.4.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.4.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.4.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.4.fc1.weight", "w2v_encoder.w2v_model.encoder.layers.4.fc1.bias", "w2v_encoder.w2v_model.encoder.layers.4.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.4.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.4.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.4.final_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.5.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.5.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.5.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.5.self_attn.v_proj.bias", "w2v_encoder.w2v_model.encoder.layers.5.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.5.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.5.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.5.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.5.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.5.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.5.fc1.weight", "w2v_encoder.w2v_model.encoder.layers.5.fc1.bias", "w2v_encoder.w2v_model.encoder.layers.5.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.5.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.5.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.5.final_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.6.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.6.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.6.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.6.self_attn.v_proj.bias", 
"w2v_encoder.w2v_model.encoder.layers.6.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.6.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.6.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.6.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.6.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.6.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.6.fc1.weight", "w2v_encoder.w2v_model.encoder.layers.6.fc1.bias", "w2v_encoder.w2v_model.encoder.layers.6.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.6.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.6.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.6.final_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.7.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.7.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.7.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.7.self_attn.v_proj.bias", "w2v_encoder.w2v_model.encoder.layers.7.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.7.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.7.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.7.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.7.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.7.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.7.fc1.weight", "w2v_encoder.w2v_model.encoder.layers.7.fc1.bias", "w2v_encoder.w2v_model.encoder.layers.7.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.7.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.7.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.7.final_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.8.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.8.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.8.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.8.self_attn.v_proj.bias", "w2v_encoder.w2v_model.encoder.layers.8.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.8.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.8.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.8.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.8.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.8.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.8.fc1.weight", "w2v_encoder.w2v_model.encoder.layers.8.fc1.bias", "w2v_encoder.w2v_model.encoder.layers.8.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.8.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.8.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.8.final_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.9.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.9.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.9.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.9.self_attn.v_proj.bias", "w2v_encoder.w2v_model.encoder.layers.9.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.9.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.9.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.9.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.9.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.9.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.9.fc1.weight", "w2v_encoder.w2v_model.encoder.layers.9.fc1.bias", 
"w2v_encoder.w2v_model.encoder.layers.9.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.9.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.9.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.9.final_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.10.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.10.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.10.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.10.self_attn.v_proj.bias", "w2v_encoder.w2v_model.encoder.layers.10.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.10.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.10.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.10.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.10.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.10.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.10.fc1.weight", "w2v_encoder.w2v_model.encoder.layers.10.fc1.bias", "w2v_encoder.w2v_model.encoder.layers.10.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.10.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.10.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.10.final_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.11.self_attn.k_proj.weight", "w2v_encoder.w2v_model.encoder.layers.11.self_attn.k_proj.bias", "w2v_encoder.w2v_model.encoder.layers.11.self_attn.v_proj.weight", "w2v_encoder.w2v_model.encoder.layers.11.self_attn.v_proj.bias", "w2v_encoder.w2v_model.encoder.layers.11.self_attn.q_proj.weight", "w2v_encoder.w2v_model.encoder.layers.11.self_attn.q_proj.bias", "w2v_encoder.w2v_model.encoder.layers.11.self_attn.out_proj.weight", "w2v_encoder.w2v_model.encoder.layers.11.self_attn.out_proj.bias", "w2v_encoder.w2v_model.encoder.layers.11.self_attn_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.11.self_attn_layer_norm.bias", "w2v_encoder.w2v_model.encoder.layers.11.fc1.weight", "w2v_encoder.w2v_model.encoder.layers.11.fc1.bias", "w2v_encoder.w2v_model.encoder.layers.11.fc2.weight", "w2v_encoder.w2v_model.encoder.layers.11.fc2.bias", "w2v_encoder.w2v_model.encoder.layers.11.final_layer_norm.weight", "w2v_encoder.w2v_model.encoder.layers.11.final_layer_norm.bias", "w2v_encoder.w2v_model.layer_norm.weight", "w2v_encoder.w2v_model.layer_norm.bias", "w2v_encoder.w2v_model.post_extract_proj.weight", "w2v_encoder.w2v_model.post_extract_proj.bias", "w2v_encoder.w2v_model.mask_emb", "w2v_encoder.w2v_model.encoder.layer_norm.weight", "w2v_encoder.w2v_model.encoder.layer_norm.bias".
File "/Users/lfratamico/Documents/projects/fairseq/fairseq/models/fairseq_model.py", line 93, in load_state_dict
return super().load_state_dict(new_state_dict, strict)
File "/Users/lfratamico/Documents/projects/hearing_happiness/src/temp.py", line 11, in <module>
model.load_state_dict(cp['model'])
How I installed fairseq (pip, source): pip, via the steps here.
Seems like a few of us are experiencing this issue.
@alexeib, I saw that you added the Wav2Vec 2.0 docs and are the main author on the paper - do you know what's happening here? :)
Finetuned models are a different kind of model: they use the pretrained model as a backbone but add a linear projection for CTC (or a decoder for seq2seq). You can import them from wav2vec2_asr.py (e.g., for this model you should load the weights into Wav2VecCtc).
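If it helps, here is a minimal sketch that sidesteps building the model by hand and lets fairseq rebuild both the task and the model from the checkpoint itself. The data override path is a placeholder — point it at a directory containing the dict.ltr.txt the model was finetuned with (and note that in newer fairseq versions the second return value is a config object rather than argparse args):

from fairseq import checkpoint_utils

# rebuilds the task (and its target dictionary) from the args stored in the
# checkpoint, then builds the model and loads the weights into it
models, args, task = checkpoint_utils.load_model_ensemble_and_task(
    ['pretrained_models/wav2vec_small_960h.pt'],
    arg_overrides={'data': '/path/to/dir/containing/dict.ltr.txt'},  # placeholder path
)
model = models[0]
model.eval()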
Thanks! I'm still getting an issue with the build_model function, though:
AttributeError: 'NoneType' object has no attribute 'target_dictionary'
You need to pass a task that has this dictionary in it.
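For context, Wav2VecCtc.build_model reads task.target_dictionary, so passing task=None produces exactly that AttributeError. A minimal sketch of setting up the task from the args stored in the checkpoint — the dictionary directory is a placeholder, and depending on your fairseq version and checkpoint the stored args may expect other paths as well, so treat this as a sketch rather than a guaranteed recipe:

import torch
from fairseq import tasks
from fairseq.models.wav2vec.wav2vec2_asr import Wav2VecCtc

cp = torch.load('pretrained_models/wav2vec_small_960h.pt', map_location=torch.device('cpu'))
# point the task at a directory containing dict.ltr.txt so it can build the target dictionary
cp['args'].data = '/path/to/dir/containing/dict.ltr.txt'  # placeholder path
task = tasks.setup_task(cp['args'])  # sets up the audio task, including its target dictionary
model = Wav2VecCtc.build_model(cp['args'], task)
model.load_state_dict(cp['model'], strict=True)
model.eval()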