1+ import soundfile
2+ import numpy as np
3+ import librosa
4+ import glob
5+ import os
6+ from sklearn .model_selection import train_test_split
7+
8+ # all emotions on RAVDESS dataset
# Mapping from the two-digit emotion code embedded in every RAVDESS
# file name to its human-readable label (codes 01..08, in order).
int2emotion = {
    "%02d" % code: label
    for code, label in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}

# Subset of emotions the model is actually trained on; files with any
# other label are skipped during loading.
AVAILABLE_EMOTIONS = {"angry", "sad", "neutral", "happy"}
27+
def extract_feature(file_name, **kwargs):
    """
    Extract features from the audio file `file_name` and return them
    concatenated into a single 1-D numpy array.

    Features supported (each enabled by a truthy keyword argument):
        - MFCC (mfcc)
        - Chroma (chroma)
        - MEL Spectrogram Frequency (mel)
        - Contrast (contrast)
        - Tonnetz (tonnetz)
    e.g:
    `features = extract_feature(path, mel=True, mfcc=True)`
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        # The STFT magnitude is only needed by chroma and contrast.
        if chroma or contrast:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            # Mean over time of 40 MFCC coefficients -> 40 values.
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            # BUG FIX: librosa >= 0.10 made melspectrogram's parameters
            # keyword-only; the original positional call
            # `melspectrogram(X, sr=...)` raises a TypeError there.
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            # Tonnetz is computed on the harmonic component of the signal.
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
            result = np.hstack((result, tonnetz))
        return result
67+
68+
def load_data(test_size=0.2, data_glob="data/Actor_*/*.wav"):
    """
    Load the RAVDESS dataset, extract speech features from each file and
    split the result into training and testing sets.

    Parameters:
        test_size (float): fraction of samples held out for testing.
        data_glob (str): glob pattern locating the .wav files.
            Parameterized (with the original value as default) so the
            data directory is no longer hard-coded.

    Returns:
        X_train, X_test, y_train, y_test as produced by sklearn's
        train_test_split (random_state=7 keeps the split reproducible).
    """
    X, y = [], []
    for file in glob.glob(data_glob):
        # get the base name of the audio file
        basename = os.path.basename(file)
        # RAVDESS file names are dash-separated fields; the third field
        # is the two-digit emotion code.
        emotion = int2emotion[basename.split("-")[2]]
        # we allow only AVAILABLE_EMOTIONS we set
        if emotion not in AVAILABLE_EMOTIONS:
            continue
        # extract speech features
        features = extract_feature(file, mfcc=True, chroma=True, mel=True)
        # add to data
        X.append(features)
        y.append(emotion)
    # split the data to training and testing and return it
    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)