读取 wav 格式声音文件

http://bigsec.net/b52/scipydoc/wave_pyaudio.html

Python 标准库的 wave 模块支持 wav 文件的读写；实时的声音输入输出需要安装 pyAudio，MP3 的解码和播放可以使用 pyMedia。

wav 是 Microsoft 开发的一种声音文件格式，通常被用来保存未压缩的声音数据 (Pulse Code Modulation，PCM，脉冲编码调制)。wav 有三个重要的参数：声道数、采样频率和量化位数。

声道数：单声道 (mono) 或者是双声道 (stereo)。
采样频率：每秒钟声音信号的采集次数。常用的有 8kHz、16kHz、32kHz、48kHz、11.025kHz、22.05kHz、44.1kHz 等。
量化位数：用多少 bit 表达一次采样所采集的数据，通常有 8bit、16bit、24bit 和 32bit 等。CD 中所储存的声音信号是双声道、44.1kHz、16bit。

如果你需要自己录制和编辑声音文件，推荐使用 Audacity。它是一款开源的、跨平台、多声道的录音编辑软件。在工作中使用 Audacity 进行声音信号的录制，然后再输出成 wav 文件供 Python 程序处理。

1. `C:\Windows\media`

(base) yongqiang@yongqiang:~$ cd /mnt/f/yongqiang_work/
(base) yongqiang@yongqiang:/mnt/f/yongqiang_work$ ll
total 260
drwxrwxrwx 1 yongqiang yongqiang   4096 Jun  4 00:47 ./
drwxrwxrwx 1 yongqiang yongqiang   4096 Jun  3 22:11 ../
-rwxrwxrwx 1 yongqiang yongqiang 191788 Sep 15  2018 Windows_Ding.wav*
-rwxrwxrwx 1 yongqiang yongqiang  70060 Sep 15  2018 ding.wav*
(base) yongqiang@yongqiang:/mnt/f/yongqiang_work$
(base) yongqiang@yongqiang:/mnt/f/yongqiang_work$ pwd
/mnt/f/yongqiang_work
(base) yongqiang@yongqiang:/mnt/f/yongqiang_work$

2. 读 wav 格式声音文件

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# yongqiang chengfrom __future__ import absolute_import
from __future__ import division
from __future__ import print_functionimport wave
import numpy as np# WAV file
audio_file = "/mnt/f/yongqiang_work/ding.wav"
object = wave.open(audio_file, "rb")# (nchannels, sampwidth, framerate, nframes, comptype, compname)
params = object.getparams()
nchannels, sampwidth, framerate, nframes, comptype, compname = params[:6]
print("nchannels = ", nchannels)
print("sampwidth = ", sampwidth)
print("framerate = ", framerate)
print("nframes = ", nframes)
print("comptype = ", comptype)
print("compname = ", compname)# Returns number of audio channels (1 for mono, 2 for stereo).
print("object.getnchannels() = ", object.getnchannels())# Returns sample width in bytes.
print("object.getsampwidth() = ", object.getsampwidth())# Returns sampling frequency.
print("object.getframerate() = ", object.getframerate())# Returns number of audio frames.
print("object.getnframes() = ", object.getnframes())# Returns compression type ('NONE' is the only supported type).
print("object.getcomptype() = ", object.getcomptype())# Human-readable version of getcomptype(). Usually 'not compressed' parallels 'NONE'.
print("object.getcompname() = ", object.getcompname())# Reads and returns at most n frames of audio, as a bytes object.
str_data = object.readframes(nframes)
object.close()

/home/yongqiang/miniconda3/envs/tf_cpu_1.4.1/bin/python /home/yongqiang/pycharm_work/yongqiang.py
nchannels =  2
sampwidth =  2
framerate =  44100
nframes =  17504
comptype =  NONE
compname =  not compressed
object.getnchannels() =  2
object.getsampwidth() =  2
object.getframerate() =  44100
object.getnframes() =  17504
object.getcomptype() =  NONE
object.getcompname() =  not compressed

Process finished with exit code 0

3. 读 wav 格式声音文件并绘制波形

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# yongqiang chengfrom __future__ import absolute_import
from __future__ import division
from __future__ import print_functionimport wave
import numpy as np
import matplotlib.pyplot as plt# WAV file
audio_file = "/mnt/f/yongqiang_work/ding.wav"
object = wave.open(audio_file, "rb")# (nchannels, sampwidth, framerate, nframes, comptype, compname)
params = object.getparams()
nchannels, sampwidth, framerate, nframes, comptype, compname = params[:6]
print("nchannels =", nchannels)
print("sampwidth =", sampwidth)
print("framerate =", framerate)
print("nframes =", nframes)
print("comptype =", comptype)
print("compname =", compname)# Returns number of audio channels (1 for mono, 2 for stereo).
print("object.getnchannels() =", object.getnchannels())# Returns sample width in bytes.
print("object.getsampwidth() =", object.getsampwidth())# Returns sampling frequency.
print("object.getframerate() =", object.getframerate())# Returns number of audio frames.
print("object.getnframes() =", object.getnframes())# Returns compression type ('NONE' is the only supported type).
print("object.getcomptype() =", object.getcomptype())# Human-readable version of getcomptype(). Usually 'not compressed' parallels 'NONE'.
print("object.getcompname() =", object.getcompname())# Reads and returns at most n frames of audio, as a bytes object.
str_data = object.readframes(nframes)
# nframes = 17504,  channels = 2, sampwidth = 2
# str_data (bytes: 70016) = nframes * channels * sampwidth = 17504 * 2 * 2 = 70016
object.close()wave_data = np.fromstring(str_data, dtype=np.short)
wave_data.shape = -1, 2
wave_data = wave_data.T
time = np.arange(0, nframes) * (1.0 / framerate)plt.subplot(211)
plt.plot(time, wave_data[0])
plt.xlabel("left channel - time (seconds)")
plt.subplot(212)
plt.plot(time, wave_data[1], c="g")
plt.xlabel("right channel - time (seconds)")
plt.show()

/home/yongqiang/miniconda3/envs/pt-1.4_py-3.6/bin/python /home/yongqiang/pycharm_work/yongqiang.py
nchannels = 2
sampwidth = 2
framerate = 44100
nframes = 17504
comptype = NONE
compname = not compressed
object.getnchannels() = 2
object.getsampwidth() = 2
object.getframerate() = 44100
object.getnframes() = 17504
object.getcomptype() = NONE
object.getcompname() = not compressed

Process finished with exit code 0

Python 调用 wave.open 打开 wav 文件，注意需要使用 "rb" (二进制模式) 打开文件：

audio_file = "/mnt/f/yongqiang_work/ding.wav"
object = wave.open(audio_file, "rb")

open 返回一个 Wave_read 类的实例，通过调用它的方法读取 wav 文件的格式和数据：

getparams：一次性返回所有的 wav 文件的格式信息，它返回的是一个元组 (tuple)：声道数，量化位数 (byte 单位)，采样频率，采样点数，压缩类型，压缩类型的描述。wave 模块只支持非压缩的数据，因此可以忽略最后两个信息。

# (nchannels, sampwidth, framerate, nframes, comptype, compname)
params = object.getparams()
nchannels, sampwidth, framerate, nframes, comptype, compname = params[:6]

getnchannels, getsampwidth, getframerate, getnframes, getcomptype, getcompname 等方法可以单独返回 wav 文件的特定的信息。

readframes：读取声音数据，传递一个参数指定需要读取的长度 (以取样点为单位)。readframes 返回的是二进制数据：在 Python 3 中是 bytes 对象，在 Python 2 中用字符串 (str) 表示二进制数据。

# Reads and returns at most n frames of audio, as a bytes object.
str_data = object.readframes(nframes)
# nframes = 17504,  channels = 2, sampwidth = 2
# str_data (bytes: 70016) = nframes * channels * sampwidth = 17504 * 2 * 2 = 70016

接下来需要根据声道数和量化单位，将读取的二进制数据转换为一个可以计算的数组：

wave_data = np.fromstring(str_data, dtype=np.short)

通过 fromstring 函数将字符串转换为数组，通过其参数 dtype 指定转换后的数据格式，由于我们的声音格式是以两个字节表示一个取样值，因此采用 short 数据类型转换。(注意：NumPy 1.14 起 fromstring 的二进制模式已被弃用，新代码建议改用 np.frombuffer(str_data, dtype=np.short)。) 现在得到的 wave_data 是一个一维的 short 类型的数组，但是因为我们的声音文件是双声道的，因此它由左右两个声道的取样交替构成：LRLRLRLR....LR (L 表示左声道的取样值，R 表示右声道取样值)。修改 wave_data 的 shape 之后：

wave_data.shape = -1, 2

将其转置得到：

wave_data = wave_data.T

最后通过取样点数和取样频率计算出每个取样的时间：

time = np.arange(0, nframes) * (1.0 / framerate)

4. sample width in bytes

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# yongqiang chengfrom __future__ import absolute_import
from __future__ import division
from __future__ import print_functionimport wave
import numpy as np
import matplotlib.pyplot as plt# WAV file
audio_file = "/mnt/f/yongqiang_work/ding.wav"
object = wave.open(audio_file, "rb")# (nchannels, sampwidth, framerate, nframes, comptype, compname)
params = object.getparams()
nchannels, sampwidth, framerate, nframes, comptype, compname = params[:6]
print("nchannels =", nchannels)
print("sampwidth =", sampwidth)
print("framerate =", framerate)
print("nframes =", nframes)
print("comptype =", comptype)
print("compname =", compname)# Returns number of audio channels (1 for mono, 2 for stereo).
print("object.getnchannels() =", object.getnchannels())# Returns sample width in bytes.
print("object.getsampwidth() =", object.getsampwidth())# Returns sampling frequency.
print("object.getframerate() =", object.getframerate())# Returns number of audio frames.
print("object.getnframes() =", object.getnframes())# Returns compression type ('NONE' is the only supported type).
print("object.getcomptype() =", object.getcomptype())# Human-readable version of getcomptype(). Usually 'not compressed' parallels 'NONE'.
print("object.getcompname() =", object.getcompname())# Reads and returns at most n frames of audio, as a bytes object.
str_data = object.readframes(nframes)
# nframes = 17504,  channels = 2, sampwidth = 2
# str_data (bytes: 70016) = nframes * channels * sampwidth = 17504 * 2 * 2 = 70016
num_bytes = len(str_data) # num_bytes = 70016
print("num_bytes =", num_bytes, "bytes")
object.close()wave_data = np.fromstring(str_data, dtype=np.short)
wave_data.shape = -1, 2
wave_data = wave_data.T
time = np.arange(0, nframes) * (1.0 / framerate)plt.subplot(211)
plt.plot(time, wave_data[0])
plt.xlabel("left channel - time (seconds)")
plt.subplot(212)
plt.plot(time, wave_data[1], c="g")
plt.xlabel("right channel - time (seconds)")
plt.show()

/home/yongqiang/miniconda3/envs/pt-1.4_py-3.6/bin/python /home/yongqiang/pytorch_work/end2end-asr-pytorch-example/yongqiang.py
nchannels = 2
sampwidth = 2
framerate = 44100
nframes = 17504
comptype = NONE
compname = not compressed
object.getnchannels() = 2
object.getsampwidth() = 2
object.getframerate() = 44100
object.getnframes() = 17504
object.getcomptype() = NONE
object.getcompname() = not compressed
num_bytes = 70016 bytes

Process finished with exit code 0

References

http://bigsec.net/b52/scipydoc/wave_pyaudio.html