# Problem encountered: extracting MFCC features first and then training a CNN
# gave very poor recognition on mobile once the model was converted to tflite.
# Root cause: MFCC extraction was not exactly aligned between training samples
# and test samples. This write-up therefore implements MFCC extraction as a
# custom Keras layer *inside* the network: the model input is fixed-length raw
# audio, and MFCC features are computed internally. At inference time no
# external MFCC extraction/alignment is needed — just feed audio of the same
# length as was used in training.

# ===========================================================================
# 1. Load the dataset and split train / validation sets -- read_data.py
# ===========================================================================
import os

import librosa
import numpy as np
from sklearn.model_selection import train_test_split

FIXED_SAMPLE_RATE = 16000  # unified sample rate: 16 kHz (speech standard)
MAX_LEN = 36000            # fixed number of samples per clip
N_MFCC = 13                # number of MFCC coefficients (13 is conventional)


def load_data4cnn(data_pt):
    """Load the dataset as raw audio (no feature extraction here).

    Features are extracted inside the network by the custom MFCC layer, which
    keeps training and inference preprocessing identical and makes the model
    directly convertible to tflite.

    Args:
        data_pt: dataset root; one sub-folder per class, folder name is the
            integer class label.

    Returns:
        X_train, X_test, y_train, y_test -- raw audio arrays of shape
        (samples, MAX_LEN) and integer label vectors of shape (samples,).
    """
    X, y = [], []
    labels = os.listdir(data_pt)
    # Persist the label list for later use; 'with' guarantees the handle is
    # closed (the original leaked it).
    with open('result/label.txt', 'w') as f:
        for label in labels:
            folder = os.path.join(data_pt, label)
            f.write(str(label) + ' label\n')
            for fname in os.listdir(folder):
                fpath = os.path.join(folder, fname)
                y_audio, sr = librosa.load(fpath, sr=FIXED_SAMPLE_RATE)
                # Force every clip to exactly MAX_LEN samples:
                # truncate long clips, zero-pad short ones.
                if len(y_audio) > MAX_LEN:
                    y_audio = y_audio[:MAX_LEN]
                else:
                    y_audio = np.pad(y_audio, (0, MAX_LEN - len(y_audio)))
                X.append(y_audio)
                y.append(int(label))
    X = np.array(X)
    y = np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print('训练样本及标签:', X_train.shape, y_train.shape)
    print('测试样本及标签:', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test


if __name__ == '__main__':
    load_data4cnn(data_pt='data4c')


# ===========================================================================
# 2. Custom MFCC layer -- MFCC.py
# ===========================================================================
import tensorflow as tf
from keras.saving import register_keras_serializable
# from tensorflow.keras.utils import register_keras_serializable


@register_keras_serializable()
class MFCCLayer(tf.keras.layers.Layer):
    """Keras layer that turns fixed-length raw audio into MFCC features.

    Registered as serializable so a saved .keras model containing this layer
    can be reloaded (and then converted to tflite).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, audio):
        # STFT: frame_length 2048 with fft_length 2048 gives
        # 2048 // 2 + 1 = 1025 frequency bins per frame.
        stft = tf.signal.stft(
            audio,
            frame_length=2048,
            frame_step=512,
            fft_length=2048,
            window_fn=tf.signal.hann_window,
        )
        spectrogram = tf.abs(stft) ** 2
        mel_matrix = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=128,
            num_spectrogram_bins=1025,  # must match fft_length // 2 + 1
            sample_rate=16000,          # matches FIXED_SAMPLE_RATE
            lower_edge_hertz=0,
            upper_edge_hertz=8000,
        )
        mel = tf.matmul(spectrogram, mel_matrix)
        # Epsilon avoids log(0) on silent frames.
        log_mel = tf.math.log(mel + 1e-6)
        mfcc = tf.signal.mfccs_from_log_mel_spectrograms(log_mel)
        # Keep only the first 13 coefficients (N_MFCC).
        return mfcc[:, :, :13]


# ===========================================================================
# 3. Train the CNN model. If training on GPU add init_gpu.py,
#    otherwise the code below can be ignored.   -- init_gpu.py
# ===========================================================================
import os
import tensorflow as tf
from keras import backend as K


def init_gpu():
    """Configure TensorFlow GPU usage (memory growth) and clear the session."""
    # Hide TensorFlow's verbose logs.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'  # 0 all, 1 INFO, 2 WARNING, 3 ERROR
    # GPU tuning.
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    gpus = tf.config.list_physical_devices('GPU')
    print(gpus)
    if gpus:
        try:
            # Grow GPU memory on demand instead of grabbing it all up front.
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print(f'✓ GPU 配置成功,使用 {len(gpus)} 个 GPU')
        except RuntimeError as e:
            print(f'GPU 配置失败: {e}')
    # Clear the session before training to release GPU memory.
    K.clear_session()


# ===========================================================================
# CNN model training
# ===========================================================================
import numpy as np
from keras.models import Model
from keras.layers import (Input, Conv1D, MaxPooling1D, GlobalAvgPool1D,
                          Dense, Dropout, BatchNormalization)
from matplotlib import pyplot as plt

from read_data import load_data4cnn, MAX_LEN
from init_gpu import init_gpu
from MFCC import MFCCLayer


def cnn_model(num_classes):
    """Build the 1-D CNN classifier: raw audio in, class probabilities out."""
    inputs = Input(shape=(MAX_LEN,), name='audio_input')
    # Feature extraction lives inside the graph, so the converted tflite
    # model needs no external MFCC pipeline on the phone.
    x = MFCCLayer()(inputs)
    x = Conv1D(512, 3, padding='same', activation='relu')(x)
    x = MaxPooling1D(2)(x)
    x = Dropout(0.3)(x)
    x = Conv1D(64, 3, padding='same', activation='relu')(x)
    x = GlobalAvgPool1D()(x)
    x = Dropout(0.35)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.35)(x)
    outputs = Dense(num_classes, activation='softmax')(x)
    return Model(inputs, outputs)


def train_model(epochs=50):
    """Train, evaluate, save the model and plot training curves."""
    X_train, X_test, y_train, y_test = load_data4cnn('data4c')
    model = cnn_model(num_classes=len(np.unique(y_train)))
    print(model.summary())
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',  # integer labels, no one-hot
        metrics=['accuracy'])
    history = model.fit(X_train, y_train,
                        validation_data=(X_test, y_test),
                        epochs=epochs, batch_size=32)
    loss, acc = model.evaluate(X_test, y_test)
    print(f'\nCNN 模型测试准确率:{acc * 100:.2f}%')
    # Save the Keras model.
    model.save('result/cnn_model1.keras')
    print('\n训练完成,模型文件保存至-- result/cnn_model1.keras')
    result_curve(history)


def result_curve(result):
    """Plot accuracy/loss curves from a fit() History and save them as SVG."""
    plt.figure()  # fixed: original had `plt.figure` (attribute, never called)
    plt.subplot(121)
    plt.plot(result.epoch, result.history['accuracy'], label='accuracy')
    plt.plot(result.epoch, result.history['val_accuracy'], label='val_accuracy')
    plt.scatter(result.epoch, result.history['accuracy'])
    plt.scatter(result.epoch, result.history['val_accuracy'])
    plt.legend(loc='lower right')
    plt.title('CNN')
    plt.subplot(122)
    plt.plot(result.epoch, result.history['loss'], label='loss')
    plt.plot(result.epoch, result.history['val_loss'], label='val_loss')
    plt.scatter(result.epoch, result.history['loss'], marker='*')
    plt.scatter(result.epoch, result.history['val_loss'], marker='*')
    plt.legend(loc='upper right')
    plt.title('CNN')
    plt.savefig('result/CNN1_curve.svg')


if __name__ == '__main__':
    init_gpu()
    train_model(200)


# ===========================================================================
# 4. Convert to a tflite model for later use on Android -- keras2tflite.py
# ===========================================================================
import os

import tensorflow as tf
from keras.saving import load_model  # fixed: was imported twice (keras.models too)

from MFCC import MFCCLayer  # import needed so the custom layer deserializes


def keras_2_tflite(model_pt, out_pt):
    """Convert a saved .keras model (with the custom MFCC layer) to tflite.

    Args:
        model_pt: path of the saved .keras model.
        out_pt: destination path for the .tflite file.
    """
    # Silence TF logs.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    # safe_mode=False is required to load the custom MFCCLayer.
    model = load_model(model_pt, safe_mode=False)

    # Manually wrap inference in a tf.function; converting the Keras model
    # directly raises "TypeError: 'NoneType' object is not callable".
    @tf.function(jit_compile=False)
    def inference_func(input_data):
        return model(input_data, training=False)

    # Pin the input signature to the model's input shape.
    input_shape = model.input_shape
    concrete_func = inference_func.get_concrete_function(
        tf.TensorSpec(input_shape, tf.float32))

    # Convert to TFLite.
    converter = tf.lite.TFLiteConverter.from_concrete_functions(
        [concrete_func], model)
    # Both settings are required, otherwise ops such as LSTM fail to convert.
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    converter.experimental_allow_custom_ops = True
    # Optional model optimisation:
    # converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_model = converter.convert()

    with open(out_pt, 'wb') as f:
        f.write(tflite_model)
    print('=' * 50)
    print('转换成功!文件已保存到--', out_pt)


if __name__ == '__main__':
    # NOTE(review): train_model() saves result/cnn_model1.keras but this loads
    # result/cnn_model.keras -- confirm which checkpoint is intended.
    keras_2_tflite(model_pt='result/cnn_model.keras',
                   out_pt='result/cnn_model.tflite')


# ===========================================================================
# 5. Test the CNN model
# ===========================================================================
import librosa
import numpy as np
from keras.models import load_model

from read_data import FIXED_SAMPLE_RATE, MAX_LEN, N_MFCC
from MFCC import MFCCLayer

# Class index -> spoken command text.
dct = {0: '关灯', 1: '开灯', 2: '关闭风扇', 3: '打开风扇'}


def process_data(fpt):
    """Load one audio file and shape it exactly like a training sample."""
    y_audio, sr = librosa.load(fpt, sr=FIXED_SAMPLE_RATE)
    # Same truncate/zero-pad policy as load_data4cnn.
    if len(y_audio) > MAX_LEN:
        y_audio = y_audio[:MAX_LEN]
    else:
        y_audio = np.pad(y_audio, (0, MAX_LEN - len(y_audio)))
    X = np.array([y_audio])  # add the batch dimension
    return X


def predict(file_path):
    """Classify one wav file; return (label, confidence) as strings.

    Returns (None, None) when confidence is below the 0.3 threshold.
    """
    # NOTE(review): path differs from the one train_model() saves
    # (result/cnn_model1.keras) -- confirm which checkpoint is intended.
    model = load_model('result/cnn_lstm_model.keras')
    X = process_data(file_path)
    pred = model.predict(X)[0]
    print(pred)
    y = np.argmax(pred)
    cnf = pred[y]
    if cnf < 0.3:
        print('未识别')
        return None, None
    else:
        # NOTE(review): rst (the human-readable command) is computed but the
        # function returns str(y); possibly rst was meant -- confirm.
        rst = dct[y]
        print(f'识别结果:{y}\t置信度:{cnf}')
        return str(y), str(round(cnf, 2))


if __name__ == '__main__':
    # Test sample.
    file = 'test_data/3_1774506934932.wav'
    predict(file)


# ===========================================================================
# 6. API wrapper and test -- api.py
# ===========================================================================
import os

from flask import Flask, request, jsonify

from test_cnn_model import predict

app = Flask(__name__)


@app.route('/predict_api', methods=['POST'])
def predict_api():
    """Accept an uploaded wav file and return label + confidence as JSON."""
    if 'file' not in request.files:
        return jsonify({'error': '没有文件'}), 400
    file = request.files['file']
    # Save to a temporary file so librosa can read it.
    temp_path = 'temp.wav'
    file.save(temp_path)
    try:
        rst, cnf = predict(temp_path)
        return jsonify({
            'label': rst,
            'confidence': cnf
        })
    except Exception as e:
        return jsonify({'error': str(e)})
    finally:
        # Always clean up the temp file, success or failure.
        if os.path.exists(temp_path):
            os.remove(temp_path)


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)


# ===========================================================================
# api_test.py
# ===========================================================================
import requests

# url = 'http://192.168.1.3:5000/predict'
url = 'http://localhost:5000/predict_api'
file_path = 'test_data/0_1774506907089.wav'
# 'with' closes the handle; the original leaked the open file object.
with open(file_path, 'rb') as fh:
    res = requests.post(url, files={'file': fh})
print(res.json())