1+ import pyaudio
2+ import os
3+ import wave
4+ import pickle
5+ from sys import byteorder
6+ from array import array
7+ from struct import pack
8+ from sklearn .neural_network import MLPClassifier
9+
10+ from utils import extract_feature
11+
# Amplitude (in signed 16-bit sample units) that is_silent() and trim() compare
# samples against to tell sound from silence.
THRESHOLD = 500
# Number of frames requested from the input stream on every read.
CHUNK_SIZE = 1024
# Sample format handed to PyAudio when the stream is opened.
FORMAT = pyaudio.paInt16
# Sampling rate in Hz, used for the stream, the silence padding and the WAV header.
RATE = 16000

# Number of silent chunks that must be counted after sound has started
# before record() stops capturing.
SILENCE = 30
18+
def is_silent(snd_data, threshold=None):
    """Return True if the peak amplitude of ``snd_data`` is below the threshold.

    The peak is measured on absolute values, so a loud negative excursion
    counts as sound just like a positive one. An empty chunk contains no
    sound and is reported as silent instead of raising ``ValueError``.

    Args:
        snd_data: Iterable of signed integer samples.
        threshold: Amplitude below which the chunk counts as silent.
            Defaults to the module-level ``THRESHOLD``.

    Returns:
        bool: True when every sample's magnitude is below ``threshold``.
    """
    if threshold is None:
        threshold = THRESHOLD
    peak = max((abs(sample) for sample in snd_data), default=0)
    return peak < threshold
22+
def normalize(snd_data):
    """Scale the samples so that the loudest one has magnitude 16384.

    Every sample is multiplied by the same factor and truncated to an
    integer. Input that is empty or entirely zero has no peak to scale
    against, so it is returned as an unchanged copy instead of raising
    ``ValueError`` / ``ZeroDivisionError``.

    Args:
        snd_data: Iterable of signed 16-bit integer samples.

    Returns:
        array: A new ``array('h')`` holding the scaled samples.
    """
    MAXIMUM = 16384
    peak = max((abs(sample) for sample in snd_data), default=0)
    if peak == 0:
        # Nothing to amplify; avoid dividing by zero.
        return array('h', snd_data)

    times = float(MAXIMUM) / peak
    return array('h', (int(sample * times) for sample in snd_data))
32+
def trim(snd_data):
    """Strip the quiet samples from both ends of ``snd_data``.

    Samples are discarded from each end until one whose magnitude exceeds
    ``THRESHOLD`` is found. The result is a new ``array('h')``; the input
    is left untouched.
    """
    def _strip_leading(samples):
        # Keep everything from the first loud sample onwards.
        kept = array('h')
        loud_seen = False
        for sample in samples:
            if loud_seen or abs(sample) > THRESHOLD:
                loud_seen = True
                kept.append(sample)
        return kept

    # Drop the quiet head.
    head_trimmed = _strip_leading(snd_data)

    # Drop the quiet tail by walking the remainder backwards, then restore
    # the original sample order.
    tail_trimmed = _strip_leading(reversed(head_trimmed))
    tail_trimmed.reverse()
    return tail_trimmed
56+
def add_silence(snd_data, seconds):
    """Return ``snd_data`` padded with ``seconds`` (float) of zero samples on each side.

    The padding length is ``int(seconds * RATE)`` samples. A new
    ``array('h')`` is returned; the input is not modified.
    """
    pad_len = int(seconds * RATE)
    padding = array('h', [0]) * pad_len

    padded = array('h', padding)
    padded.extend(snd_data)
    padded.extend(padding)
    return padded
63+
def record():
    """
    Record a word or words from the microphone and
    return the data as an array of signed shorts.

    Capturing stops once sound has been detected and more than ``SILENCE``
    silent chunks have been counted afterwards. The captured audio is then
    normalized, trimmed of silence at the start and end, and padded with
    0.5 seconds of blank sound so that players such as VLC do not chop
    off the ends.

    The stream and the PyAudio instance are always released, even when
    reading from the device raises.

    Returns:
        tuple: ``(sample_width, samples)`` where ``sample_width`` is the
        size in bytes of one sample in ``FORMAT`` and ``samples`` is an
        ``array('h')``.
    """
    p = pyaudio.PyAudio()
    try:
        # NOTE(review): output=True is kept from the original call although
        # this function only reads from the stream - confirm it is needed.
        stream = p.open(format=FORMAT, channels=1, rate=RATE,
                        input=True, output=True,
                        frames_per_buffer=CHUNK_SIZE)
        try:
            num_silent = 0
            snd_started = False
            r = array('h')

            while True:
                # little endian, signed short
                snd_data = array('h', stream.read(CHUNK_SIZE))
                if byteorder == 'big':
                    snd_data.byteswap()
                r.extend(snd_data)

                silent = is_silent(snd_data)

                # NOTE(review): num_silent is never reset, so it counts all
                # silent chunks since sound started, not a consecutive run.
                if silent and snd_started:
                    num_silent += 1
                elif not silent and not snd_started:
                    snd_started = True

                if snd_started and num_silent > SILENCE:
                    break
        finally:
            stream.stop_stream()
            stream.close()

        sample_width = p.get_sample_size(FORMAT)
    finally:
        p.terminate()

    r = normalize(r)
    r = trim(r)
    r = add_silence(r, 0.5)
    return sample_width, r
109+
def record_to_file(path):
    """Record from the microphone and write the result to ``path`` as a WAV file.

    The file is written as mono, little-endian signed 16-bit samples at
    ``RATE`` Hz. The wave writer is closed even if writing fails.

    Args:
        path: Destination passed to ``wave.open``.
    """
    sample_width, data = record()
    # A repeat count keeps the format string short regardless of length.
    frames = pack('<%dh' % len(data), *data)

    with wave.open(path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(frames)
121+
122+
if __name__ == "__main__":
    # Load the saved model (after training). The file is opened in a
    # context manager so the handle is closed once the model is loaded.
    # NOTE: unpickling can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open("result/mlp_classifier.model", "rb") as model_file:
        model = pickle.load(model_file)
    print("Please talk")
    filename = "test.wav"
    # Record the file (start talking).
    record_to_file(filename)
    # Extract features and reshape them into a single-row 2-D array.
    features = extract_feature(filename, mfcc=True, chroma=True, mel=True).reshape(1, -1)
    # Predict and take the label of the single row.
    result = model.predict(features)[0]
    # Show the result.
    print("result:", result)
0 commit comments