(2条消息) 【开发日志】2022.09.02 ZENO----Audio----Beat detection algorithm----Combine Wav&Mp3_minimp3 和 ffmpeg_EndlessDaydream的博客-CSDN博客https://blog.csdn.net/Angelloveyatou/article/details/126670613

4 音频检测算法设计

4.1 节拍检测算法

4.1.1 节拍检测算法








3.快速傅里叶变换法:该方法通过对信号进行快速傅里叶变换(FFT)来检测节拍。 FFT将信号转换为频率域,其中可以检测到频率和强度。可以根据频率域中的能量峰值来确定信号的节拍。




4.1.2 基于声能的简单统计模型检测节拍


使用 1024 个样本的窗口大小和 44100 Hz 的采样率,我们需要一个 44100/1024 = 43 个元素的缓冲区来存储 1 秒的历史记录。此样本的值可以从FFT分析中获得。

我们将分析集中在频谱的低频段(前几个子带)上,这样做的原因是检查声音的较低频率,以捕捉架子鼓中底鼓和军鼓的敲击;架子鼓是最常用来把握歌曲节奏的乐器之一。在我们的实验中,我们将采用 60hz-130hz 的低音范围,我们将在其中找到底鼓,以及中低音 301hz-750hz,在那里可以找到军鼓声音。中低音包含大多数乐器的低次谐波,通常被视为低音存在范围。

因此,我们需要获取此范围内的声音信息,并获取FFT结果的相应元素。要获得FFT结果中每个元素的频率,我们只需要计算频率分割(44100/1024 = 43)并将其乘以数据数组的索引。所以第一个组件存储范围 0-43Hz 的结果,第二个组件存储 43-86Hz,第三个 86-129Hz 的结果......


假设 k 和 k+n 是实际处理范围的极限,FFT[i] 是 i 位置的频率幅度。我们可以计算范围的当前能量为

我们需要将此值与接下来的 42 个样本一起存储,以获得 1 秒的历史记录 (H)。



我们可以定义一条线(方差,阈值)方程来表示阈值和方差之间的关系。以 (0, 1.55) 和 (0.02, 1.25) 作为这条线的两个点,可得斜率为 (1.25 − 1.55) / 0.02 = −15,即:阈值 = −15 × 方差 + 1.55。

我们的 FFT 结果在 0..1 范围内,因此方差值也在 0..1 范围内。


若当前能量大于阈值与历史平均能量的乘积,则输出 1,反之输出 0,从而生成 01 序列输出到下一个结点。









4.1.3 本系统中部分音频结点


    struct AudioBeats : zeno::INode {std::deque<double> H;virtual void apply() override {auto wave = get_input<PrimitiveObject>("wave");float threshold = get_input<NumericObject>("threshold")->get<float>();auto start_time = get_input<NumericObject>("time")->get<float>();float sampleFrequency = wave->userData().get<zeno::NumericObject>("SampleRate")->get<float>();int start_index = int(sampleFrequency * start_time);int duration_count = 1024;auto fft = Aquila::FftFactory::getFft(duration_count);std::vector<double> samples;samples.resize(duration_count);for (auto i = 0; i < duration_count; i++) {
//                if (start_index + i >= wave->size()) {
//                    break;
//                }samples[i] = wave->attr<float>("value")[min((start_index + i), wave->size()-1)];//if (start_index + i >= wave->size()) {//    break;//}//samples[i] = wave->attr<float>("value")[start_index + i];}Aquila::SpectrumType spectrums = fft->fft(samples.data());{double E = 0;for (const auto& spectrum: spectrums) {E += spectrum.real() * spectrum.real() + spectrum.imag() * spectrum.imag();}E /= duration_count;H.push_back(E);}while (H.size() > 43) {H.pop_front();}double avg_H = 0;for (const auto& E: H) {avg_H += E;}avg_H /= H.size();double var_H = 0;for (const auto& E: H) {var_H += (E - avg_H) * (E - avg_H);}var_H /= H.size();int beat = H.back() - threshold > (-15 * var_H + 1.55) * avg_H;set_output("beat", std::make_shared<NumericObject>(beat));set_output("var_H", std::make_shared<NumericObject>((float)var_H));auto output_H = std::make_shared<ListObject>();for (int i = 0; i < 43 - H.size(); i++) {output_H->arr.emplace_back(std::make_shared<NumericObject>((float)0));}for (const auto & h: H) {output_H->arr.emplace_back(std::make_shared<NumericObject>((float)h));}set_output("H", output_H);auto output_E = std::make_shared<ListObject>();for (const auto& spectrum: spectrums) {double e = spectrum.real() * spectrum.real() + spectrum.imag() * spectrum.imag();output_E->arr.emplace_back(std::make_shared<NumericObject>((float)e));}set_output("E", output_E);}};ZENDEFNODE(AudioBeats, {{"wave",{"float", "time", "0"},{"float", "threshold", "0.005"},},{"beat","var_H","H","E",},{},{"audio"},});struct AudioEnergy : zeno::INode {double minE = std::numeric_limits<double>::max();double maxE = std::numeric_limits<double>::min();std::vector<double> init;virtual void apply() override {auto wave = get_input<PrimitiveObject>("wave");int duration_count = 1024;if (init.empty()) {auto fft = Aquila::FftFactory::getFft(duration_count);int clip_count = wave->size() / duration_count;init.reserve(clip_count);for (auto i = 0; i < clip_count; i++) {std::vector<double> 
samples;samples.resize(duration_count);for (auto j = 0; j < duration_count; j++) {samples[j] = wave->attr<float>("value")[min(duration_count * i + j, wave->size()-1)];}Aquila::SpectrumType spectrums = fft->fft(samples.data());{double E = 0;for (const auto& spectrum: spectrums) {E += spectrum.real() * spectrum.real() + spectrum.imag() * spectrum.imag();}E /= duration_count;minE = min(minE, E);maxE = max(maxE, E);init.push_back(E);}}//            for (auto i = 0; i < clip_count; i++) {//                init[i] = init[i] / maxE;//            }}//        auto vis = std::make_shared<PrimitiveObject>();//        vis->resize(init.size());//        auto &index = vis->add_attr<float>("index");//        auto &listE = vis->add_attr<float>("E");//        for (auto i = 0; i < init.size(); i++) {//            index[i] = i;//            listE[i] = init[i];//        }//        set_output("vis", vis);set_output("minE", std::make_shared<NumericObject>((float)minE));set_output("maxE", std::make_shared<NumericObject>((float)maxE));auto start_time = get_input2<float>("time");float sampleFrequency = wave->userData().get<zeno::NumericObject>("SampleRate")->get<float>();int start_index = int(sampleFrequency * start_time);auto fft = Aquila::FftFactory::getFft(duration_count);std::vector<double> samples;samples.resize(duration_count);for (auto i = 0; i < duration_count; i++) {samples[i] = wave->attr<float>("value")[min((start_index + i), wave->size()-1)];}Aquila::SpectrumType spectrums = fft->fft(samples.data());double E = 0;for (const auto& spectrum: spectrums) {E += spectrum.real() * spectrum.real() + spectrum.imag() * spectrum.imag();}E /= duration_count;set_output("E", std::make_shared<NumericObject>((float)E));double uniE = (E - minE) / (maxE - minE);set_output("uniE", std::make_shared<NumericObject>((float)uniE));start_index /= duration_count;start_index = min(start_index, init.size() - 1);std::vector<double> _queue;for (int i = max(start_index - 43, 0); i < start_index; i++) 
{_queue.push_back((init[i] - minE) / (maxE - minE));}if (_queue.size() > 0) {double avg_H = 0;for (const double & e: _queue) {avg_H += e;}avg_H /= _queue.size();double var_H = 0;for (const double & e: _queue) {var_H += (e - avg_H) * (e - avg_H);}var_H /= _queue.size();double std_H = sqrt(var_H);//            zeno::log_info("E: {}, avg_H: {}, std_H: {}, var_H: {}", uniE, avg_H, std_H, var_H);float threshold = get_input2<float>("threshold");int beat = uniE > avg_H + std_H * threshold;set_output("beat", std::make_shared<NumericObject>(beat));}else {set_output("beat", std::make_shared<NumericObject>(0));}}};ZENDEFNODE(AudioEnergy, {{"wave",{"float", "time", "0"},{"float", "threshold", "1"},},{"beat","E","uniE","minE","maxE",
//            "vis",},{},{"audio"},});struct AudioFFT : zeno::INode {virtual void apply() override {auto wave = get_input<PrimitiveObject>("wave");int duration_count = 1024;auto start_time = get_input2<float>("time");float sampleFrequency = wave->userData().get<zeno::NumericObject>("SampleRate")->get<float>();int start_index = int(sampleFrequency * start_time);std::vector<double> samples;samples.resize(duration_count+1);for (auto i = 0; i < duration_count+1; i++) {samples[i] = wave->attr<float>("value")[min((start_index + i), wave->size()-1)];}auto pre_emphasis = get_input2<int>("preEmphasis");if (pre_emphasis) {auto alpha = get_input2<float>("preEmphasisAlpha");for (auto i = 0; i < duration_count; i++) {samples[i] = samples[i+1] - alpha * samples[i];}}samples.pop_back();auto hamming_window = get_input2<int>("hammingWindow");if (hamming_window) {for (auto i = 0; i < duration_count; i++) {double i_value = 0.54 - 0.46 * std::cos(2.0 * M_PI * i / (duration_count - 1));samples[i] = samples[i] * i_value;}}auto fft = Aquila::FftFactory::getFft(duration_count);Aquila::SpectrumType spectrums = fft->fft(samples.data());auto fft_prim = std::make_shared<PrimitiveObject>();fft_prim->resize(duration_count / 2 + 1);auto &freq = fft_prim->add_attr<float>("freq");auto &real = fft_prim->add_attr<float>("real");auto &image = fft_prim->add_attr<float>("image");auto &square = fft_prim->add_attr<float>("square");auto &power = fft_prim->add_attr<float>("power");for (std::size_t i = 0; i < fft_prim->verts.size(); ++i) {float r = spectrums[i].real();float im = spectrums[i].imag();freq[i] = float(i);real[i] = r;image[i] = im;float square_v = r * r + im * im;square[i] = square_v;power[i] = square_v / duration_count;}set_output("FFTPrim", fft_prim);}};ZENDEFNODE(AudioFFT, {{"wave",{"float", "time", "0"},{"bool", "preEmphasis", "0"},{"float", "preEmphasisAlpha", "0.97"},{"bool", "hammingWindow", "1"},},{"FFTPrim",},{},{"audio"},});struct MelFilter : zeno::INode {virtual void apply() override 
{auto fftPrim = get_input<PrimitiveObject>("FFTPrim");auto &power = fftPrim->attr<float>("power");auto sampleFreq = get_input2<float>("sampleFreq");auto rangePerFilter = get_input2<float>("rangePerFilter");float halfFreq = sampleFreq / 2;auto count = get_input2<int>("count");std::vector<float> hz_points;float mel_fh = 2595.0 * log10(1+halfFreq/700.0);for (int i = 0; i <= count + 1; i++) {float mel = mel_fh * i / (count + 1);float hz = 700.0 * (pow(10.0, mel / 2595.0) - 1);hz_points.push_back(hz);}std::vector<int> bin;for (const auto& hz: hz_points) {int index = (1024.0+1.0) * hz / sampleFreq;bin.push_back(index);}auto fbank = std::make_shared<PrimitiveObject>();fbank->resize(count);auto& fbank_v = fbank->add_attr<float>("fbank");for (auto i = 1; i <= count; i++) {int s = bin[i-1];int m = bin[i];int e = bin[i+1];s = (int) zaudio::lerp(m, s, rangePerFilter);e = (int) zaudio::lerp(m, e, rangePerFilter);float total = 0;for (auto i = s; i < m; i++) {float cof = (float)(m - i) / (float)(m - s);total += power[i] * cof;}for (auto i = m; i < e; i++) {float cof = 1 - (float)(m - i) / (float)(e - m);total += power[i] * cof;}if (total == 0) {fbank_v[i-1] = std::numeric_limits<float>::min();}else {fbank_v[i-1] = log(total);}}auto indexType = get_input2<std::string>("indexType");if (indexType == "index") {auto& index = fbank->add_attr<float>("i");for (auto i = 1; i <= count; i++) {index[i-1] = (float)(i-1);};} else if (indexType == "indexdivcount") {auto& index = fbank->add_attr<float>("i");for (auto i = 1; i <= count; i++) {index[i-1] = (float)(i-1) /count;};}set_output("FilterBank", fbank);}};ZENDEFNODE(MelFilter, {{"FFTPrim",{"int", "count", "15"},{"float", "sampleFreq", "44100"},{"float", "rangePerFilter", "1"},{"enum none index indexdivcount", "indexType", "index"},},{"FilterBank",},{},{"audio",},});
} // namespace zeno


BEAT DETECTION ALGORITHMS.doc (parallelcube.com): https://www.parallelcube.com/web/wp-content/uploads/2018/03/BeatDetectionAlgorithms.pdf


  1. "A Review on Audio Event Detection," H. Su, et al., IEEE Access, vol. 8, pp. 77580-77593, 2020.

  2. "Acoustic Event Detection with SEDNN: A Deep Learning Approach," P. Jaiswal and Y. Han, 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Brighton, United Kingdom, 2019, pp. 316-320.

  3. "Event Detection Using Multitask Learning of Auditory Features and Sound Event Classifiers," D. D. Lee, et al., IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 25, no. 6, pp. 1190-1201, 2017.

  4. "Audio Event Detection Using Deep Learning with Mel-Frequency Cepstral Coefficients," S. Gupta, et al., 2019 IEEE 10th Annual Information Technology, Electronics and Mobile Communication Conference (IEMCON), Vancouver, BC, Canada, 2019, pp. 186-191.

  5. "Deep Convolutional Neural Networks for Acoustic Event Detection in Domestic Environments," M. L. Seltzer, et al., IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 24, no. 1, pp. 111-125, 2016.

  6. "Environmental Sound Classification with Convolutional Neural Networks," J. Salamon, et al., IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Brisbane, QLD, Australia, 2015, pp. 732-736.



