Abstract
The paper discusses the usage of linear transformations of Hidden Markov Models, normally employed for speaker and environment adaptation, as a way of extracting the emotional components from the speech. A constrained version of Maximum Likelihood Linear Regression (CMLLR) transformation is used as a feature for classification of normal or aroused emotional state. We present a procedure of incrementally building a set of speaker independent acoustic models, that are used to estimate the CMLLR transformations for emotion classification. An audio-video database of spontaneous emotions (AvID) is briefly presented since it forms the basis for the evaluation of the proposed method. Emotion classification using the video part of the database is also described and the added value of combining the visual information with the audio features is shown.
Links
BibTeX (Download)
% Conference paper entry (key: InterSp2009). Accented letters are written as
% BibTeX special characters ({\v{s}}) and names use the "Last, First" form.
% NOTE(review): `tppubtype` is not a standard BibTeX field; it is kept
% unchanged on the assumption that the exporting tool reads it -- confirm.
@inproceedings{InterSp2009,
  author    = {Gaj{\v{s}}ek, Rok and {\v{S}}truc, Vitomir and Dobri{\v{s}}ek, Simon and Miheli{\v{c}}, France},
  title     = {Emotion recognition using linear transformations in combination with video},
  booktitle = {Speech and intelligence: proceedings of {Interspeech} 2009},
  pages     = {1967--1970},
  address   = {Brighton, UK},
  year      = {2009},
  date      = {2009-09-01},
  url       = {http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/InSP.pdf},
  abstract  = {The paper discusses the usage of linear transformations of Hidden Markov Models, normally employed for speaker and environment adaptation, as a way of extracting the emotional components from the speech. A constrained version of Maximum Likelihood Linear Regression (CMLLR) transformation is used as a feature for classification of normal or aroused emotional state. We present a procedure of incrementally building a set of speaker independent acoustic models, that are used to estimate the CMLLR transformations for emotion classification. An audio-video database of spontaneous emotions (AvID) is briefly presented since it forms the basis for the evaluation of the proposed method. Emotion classification using the video part of the database is also described and the added value of combining the visual information with the audio features is shown.},
  keywords  = {emotion recognition, facial expression recognition, interspeech, speech, speech technologies, spontaneous emotions},
  pubstate  = {published},
  tppubtype = {conference},
}