2016
Janez Križaj; Simon Dobrišek; France Mihelič; Vitomir Štruc: Facial Landmark Localization from 3D Images. Inproceedings. In: Proceedings of the Electrotechnical and Computer Science Conference (ERK), Portorož, Slovenia, 2016.

Abstract: A novel method for automatic facial landmark localization is presented. The method builds on the supervised descent framework, which was shown to successfully localize landmarks in the presence of large expression variations and mild occlusions, but struggles when localizing landmarks on faces with large pose variations. We propose an extension of the supervised descent framework which trains multiple descent maps and results in increased robustness to pose variations. The performance of the proposed method is demonstrated on the Bosphorus database for the problem of facial landmark localization from 3D data. Our experimental results show that the proposed method exhibits increased robustness to pose variations, while retaining high performance in the case of expression and occlusion variations.
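The supervised descent framework underlying this work is a cascade of learned linear updates. The following is a minimal numpy sketch of training and applying one descent map on toy data; under the multi-map extension described above, one would train several such maps (e.g., one per pose cluster) and select among them at run time. All names and dimensions are illustrative assumptions, not the paper's implementation.

```python
import numpy as np

rng = np.random.default_rng(0)

def train_descent_map(features, deltas):
    # Least-squares regressor mapping features phi(x_k) to shape updates
    # (x* - x_k); the appended ones-column plays the role of the bias b_k.
    A = np.hstack([features, np.ones((features.shape[0], 1))])
    W, *_ = np.linalg.lstsq(A, deltas, rcond=None)
    return W

def apply_descent_map(shape, feat, W):
    # One supervised-descent update: x_{k+1} = x_k + R_k phi(x_k) + b_k.
    return shape + np.append(feat, 1.0) @ W

# Toy data: 200 training samples, 32-dim features, 10 landmarks (20 coords).
feats = rng.normal(size=(200, 32))
deltas = feats @ rng.normal(size=(32, 20)) * 0.1   # synthetic target updates
W = train_descent_map(feats, deltas)

x = rng.normal(size=20)                            # current landmark estimate
x = apply_descent_map(x, rng.normal(size=32), W)   # one cascade stage
```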
Simon Dobrišek; David Čefarin; Vitomir Štruc; France Mihelič: Assessment of the Google Speech Application Programming Interface for Automatic Slovenian Speech Recognition. Inproceedings. In: Jezikovne Tehnologije in Digitalna Humanistika, 2016.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/jtdh16-ulfe-luks-sd-final-pdfa.pdf

Abstract: Automatic speech recognizers are slowly maturing into technologies that enable humans to communicate more naturally and effectively with a variety of smart devices and information-communication systems. Large global companies such as Google, Microsoft, Apple, IBM and Baidu compete in developing the most reliable speech recognizers, supporting as many of the main world languages as possible. Due to the relatively small number of speakers, support for the Slovenian spoken language is lagging behind, and among the major global companies only Google has recently added support for our spoken language. The paper presents the results of our independent assessment of the Google speech application programming interface for automatic Slovenian speech recognition. For the experiments, we used speech databases that are otherwise used for the development and assessment of Slovenian speech recognizers.
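The paper does not publish its scoring code, but assessments of this kind are conventionally reported as word error rate (WER). A minimal sketch of the standard edit-distance computation (the sample sentences are made up):

```python
import numpy as np

def word_error_rate(reference: str, hypothesis: str) -> float:
    """WER = (substitutions + deletions + insertions) / reference length,
    computed with the standard edit-distance dynamic program."""
    ref, hyp = reference.split(), hypothesis.split()
    d = np.zeros((len(ref) + 1, len(hyp) + 1), dtype=int)
    d[:, 0] = np.arange(len(ref) + 1)
    d[0, :] = np.arange(len(hyp) + 1)
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i, j] = min(d[i - 1, j] + 1,          # deletion
                          d[i, j - 1] + 1,          # insertion
                          d[i - 1, j - 1] + cost)   # substitution / match
    return d[len(ref), len(hyp)] / max(len(ref), 1)

print(word_error_rate("danes je lep dan", "danes lep dan je"))
```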
2015
Tadej Justin; Vitomir Štruc; Simon Dobrišek; Boštjan Vesnicer; Ivo Ipšić; France Mihelič: Speaker de-identification using diphone recognition and speech synthesis. Conference. In: 11th IEEE International Conference and Workshops on Automatic Face and Gesture Recognition (IEEE FG): DeID 2015, vol. 4, pp. 1-7, IEEE, 2015.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/Deid2015.pdf

Abstract: The paper addresses the problem of speaker (or voice) de-identification by presenting a novel approach for concealing the identity of speakers in their speech. The proposed technique first recognizes the input speech with a diphone recognition system and then transforms the obtained phonetic transcription into the speech of another speaker with a speech synthesis system. Because a Diphone RecOgnition step and a sPeech SYnthesis step are used during the de-identification, we refer to the developed technique as DROPSY. With this approach the acoustic models of the recognition and synthesis modules are completely independent from each other, which ensures the highest level of input-speaker de-identification. The proposed DROPSY-based de-identification approach is language dependent, text independent and capable of running in real time due to the relatively simple computing methods used. When designing speaker de-identification technology, two requirements are typically imposed: i) it should not be possible to establish the identity of the speakers based on the de-identified speech, and ii) the processed speech should still sound natural and be intelligible. This paper therefore implements the proposed DROPSY-based approach with two different speech synthesis techniques (i.e., the HMM-based and the diphone TD-PSOLA-based technique). The resulting de-identified speech is evaluated for intelligibility and assessed in speaker verification experiments with a state-of-the-art (i-vector/PLDA) speaker recognition system. The comparison of the two speech synthesis modules integrated in the proposed method reveals that both can efficiently de-identify the input speakers while still producing intelligible speech.
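For illustration only, a structural sketch of the DROPSY pipeline; the `Recognizer` and `Synthesizer` stubs below are placeholders standing in for the paper's diphone recognizer and HMM/TD-PSOLA synthesizers, not real components:

```python
from typing import Callable

Recognizer = Callable[[bytes], str]    # waveform -> diphone/phonetic transcription
Synthesizer = Callable[[str], bytes]   # transcription -> waveform in a target voice

def de_identify(audio: bytes, recognize: Recognizer, synthesize: Synthesizer) -> bytes:
    transcription = recognize(audio)    # Diphone RecOgnition step (DRO-)
    return synthesize(transcription)    # sPeech SYnthesis step (-PSY)

# Because the recognition and synthesis acoustic models share no parameters,
# only the phonetic content (never the input voice) reaches the output.
```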
Simon Dobrišek; Vitomir Štruc; Janez Križaj; France Mihelič: Face recognition in the wild with the Probabilistic Gabor-Fisher Classifier. Conference. In: 11th IEEE International Conference and Workshops on Automatic Face and Gesture Recognition (IEEE FG): BWild 2015, vol. 2, pp. 1-6, IEEE, 2015.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/Bwild2015.pdf

Abstract: The paper addresses the problem of face recognition in the wild. It introduces a novel approach to unconstrained face recognition that exploits Gabor magnitude features and a simplified version of probabilistic linear discriminant analysis (PLDA). The novel approach, named the Probabilistic Gabor-Fisher Classifier (PGFC), first extracts a vector of Gabor magnitude features from the given input image using a battery of Gabor filters, then reduces the dimensionality of the extracted feature vector by projecting it into a low-dimensional subspace, and finally produces a representation suitable for identity inference by applying PLDA to the projected feature vector. The proposed approach extends the popular Gabor-Fisher Classifier (GFC) to a probabilistic setting and thus improves on the generalization capabilities of the GFC method. The PGFC technique is assessed in face verification experiments on the Point and Shoot Face Recognition Challenge (PaSC) database, which features real-world videos of subjects performing everyday tasks. Experimental results on this challenging database show the feasibility of the proposed approach, which improves on the best results on this database reported in the literature at the time of writing.
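The first stage of such a pipeline is easy to sketch. Below is a minimal Gabor magnitude extractor with the commonly used 5 scales and 8 orientations; the filter parameters are illustrative guesses, and the paper's subspace projection and PLDA stages are omitted:

```python
import numpy as np
from numpy.fft import fft2, ifft2

def gabor_kernel(size=31, sigma=4.0, theta=0.0, lam=8.0, gamma=0.5):
    # One complex Gabor kernel: a Gaussian envelope times a complex carrier.
    half = size // 2
    y, x = np.mgrid[-half:half + 1, -half:half + 1]
    xr = x * np.cos(theta) + y * np.sin(theta)
    yr = -x * np.sin(theta) + y * np.cos(theta)
    return (np.exp(-(xr**2 + (gamma * yr)**2) / (2 * sigma**2))
            * np.exp(1j * 2 * np.pi * xr / lam))

def gabor_magnitudes(image, n_scales=5, n_orient=8):
    # Magnitude responses of a 5-scale x 8-orientation bank, via FFT;
    # these form the raw Gabor magnitude feature maps.
    feats = []
    for s in range(n_scales):
        for t in range(n_orient):
            k = gabor_kernel(sigma=2.0 * 2 ** (s / 2.0),
                             theta=np.pi * t / n_orient,
                             lam=4.0 * 2 ** (s / 2.0))
            feats.append(np.abs(ifft2(fft2(image) * fft2(k, s=image.shape))))
    return np.stack(feats)

face = np.random.rand(64, 64)
print(gabor_magnitudes(face).shape)   # (40, 64, 64) before downsampling/projection
```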
Tadej Justin; Vitomir Štruc; Janez Žibert; France Mihelič: Development and Evaluation of the Emotional Slovenian Speech Database-EmoLUKS. Conference. In: Proceedings of the International Conference on Text, Speech, and Dialogue (TSD), pp. 351-359, Springer, 2015.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/tsd2015.pdf

Abstract: This paper describes a speech database built from 17 Slovenian radio dramas. The dramas were obtained from the national radio-and-television station (RTV Slovenia), which made the audio material available to the university under an academic license for processing and annotation. The utterances of one male and one female speaker were transcribed, segmented and then annotated with the emotional states of the speakers. The annotation of the emotional states was conducted in two stages with our own web-based application for crowdsourcing. The final (emotional) speech database consists of 1385 recordings of one male (975 recordings) and one female (410 recordings) speaker and contains labeled emotional speech with a total duration of around 1 hour and 15 minutes. The paper presents the two-stage annotation process used to label the data and demonstrates the usefulness of the employed annotation methodology. Baseline emotion recognition experiments are also presented. The results are reported as unweighted and weighted average recalls and precisions for 2-class and 7-class recognition experiments.
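The reported unweighted and weighted average recalls follow standard definitions; a small sketch, assuming integer class labels:

```python
import numpy as np

def average_recalls(y_true, y_pred, n_classes):
    """Unweighted average recall (UAR) treats every class equally;
    weighted average recall (WAR) weights each class by its support."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    recalls, support = [], []
    for c in range(n_classes):
        mask = y_true == c
        if mask.any():
            recalls.append((y_pred[mask] == c).mean())
            support.append(mask.sum())
    recalls, support = np.array(recalls), np.array(support)
    uar = recalls.mean()
    war = (recalls * support).sum() / support.sum()
    return uar, war

print(average_recalls([0, 0, 0, 1, 1, 2], [0, 0, 1, 1, 1, 0], 3))
```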
2014
Janez Križaj; Vitomir Štruc; France Mihelič: A Feasibility Study on the Use of Binary Keypoint Descriptors for 3D Face Recognition. Conference. In: Proceedings of the Mexican Conference on Pattern Recognition (MCPR), pp. 142-151, Springer, 2014.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/MCPR2014.pdf

Abstract: Despite the progress made in the area of local image descriptors in recent years, virtually no literature is available on the use of the more recent descriptors for the problem of 3D face recognition, such as BRIEF, ORB, BRISK or FREAK, which are binary in nature and, therefore, tend to be faster to compute and match, while requiring significantly less memory for storage than, for example, SIFT or SURF. In this paper, we try to close this gap and present a feasibility study on the use of these descriptors for 3D face recognition. The descriptors are evaluated on three challenging 3D face image datasets, namely FRGC, UMB and CASIA. Our experiments show that the binary descriptors ensure slightly lower verification rates than SIFT, comparable to those of the SURF descriptor, while being an order of magnitude faster than SIFT. The results suggest that the use of binary descriptors represents a viable alternative to the established descriptors.
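To illustrate why binary descriptors match so quickly, the sketch below extracts ORB descriptors from two depth images (random stand-ins here) and compares them under the Hamming distance with OpenCV; it mirrors the general setup, not the paper's exact matching scheme:

```python
import cv2
import numpy as np

# Synthetic 8-bit "depth images" standing in for real range scans.
depth_a = (np.random.rand(128, 128) * 255).astype(np.uint8)
depth_b = (np.random.rand(128, 128) * 255).astype(np.uint8)

orb = cv2.ORB_create(nfeatures=200)
kp_a, des_a = orb.detectAndCompute(depth_a, None)
kp_b, des_b = orb.detectAndCompute(depth_b, None)

# Binary descriptors are compared with the Hamming distance, which is why
# they match an order of magnitude faster than floating-point SIFT vectors.
matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
if des_a is not None and des_b is not None:
    matches = sorted(matcher.match(des_a, des_b), key=lambda m: m.distance)
    score = np.mean([m.distance for m in matches[:50]]) if matches else np.inf
    print(f"matching score (lower is more similar): {score:.1f}")
```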
2013
Simon Dobrišek; Rok Gajšek; France Mihelič; Nikola Pavešić; Vitomir Štruc: Towards efficient multi-modal emotion recognition. Journal Article. In: International Journal of Advanced Robotic Systems, 10 (53), 2013.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/multimodel-emotion.pdf
DOI: 10.5772/54002

Abstract: The paper presents a multi-modal emotion recognition system exploiting audio and video (i.e., facial expression) information. The system first processes both sources of information individually to produce corresponding matching scores and then combines the computed matching scores to obtain a classification decision. For the video part of the system, a novel approach to emotion recognition, relying on image-set matching, is developed. The proposed approach avoids the need for detecting and tracking specific facial landmarks throughout the given video sequence, which represents a common source of error in video-based emotion recognition systems, and, therefore, adds robustness to the video processing chain. The audio part of the system, on the other hand, relies on utterance-specific Gaussian Mixture Models (GMMs) adapted from a Universal Background Model (UBM) via maximum a posteriori probability (MAP) estimation. It improves upon the standard UBM-MAP procedure by exploiting gender information when building the utterance-specific GMMs, thus ensuring enhanced emotion recognition performance. Both the uni-modal parts as well as the combined system are assessed on the challenging multi-modal eNTERFACE'05 corpus with highly encouraging results. The developed system represents a feasible solution to emotion recognition that can easily be integrated into various systems, such as humanoid robots, smart surveillance systems and the like.
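The score-level combination step can be sketched in a few lines; the z-normalization and equal weighting below are illustrative assumptions, not necessarily the paper's fusion rule:

```python
import numpy as np

def fuse_scores(audio_scores, video_scores, w=0.5):
    """Sum-rule fusion of per-class matching scores after z-normalization,
    a common way to combine uni-modal classifiers at the score level."""
    def znorm(s):
        s = np.asarray(s, dtype=float)
        return (s - s.mean()) / (s.std() + 1e-9)
    fused = w * znorm(audio_scores) + (1 - w) * znorm(video_scores)
    return int(np.argmax(fused))   # index of the winning emotion class

print(fuse_scores([2.1, 0.3, 1.0], [0.2, 0.9, 0.8]))
```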
2010
Vitomir Štruc; Boštjan Vesnicer; France Mihelič; Nikola Pavešić: Removing Illumination Artifacts from Face Images using the Nuisance Attribute Projection. Conference. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP'10), pp. 846-849, IEEE, Dallas, Texas, USA, 2010.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/ICASSP2010.pdf
DOI: 10.1109/ICASSP.2010.5495203

Abstract: Illumination-induced appearance changes represent one of the open challenges in automated face recognition systems, still significantly influencing their performance. Several techniques have been presented in the literature to cope with this problem; however, a universal solution remains to be found. In this paper we present a novel normalization scheme based on the nuisance attribute projection (NAP), which tries to remove the effects of illumination by projecting away multiple dimensions of a low-dimensional illumination subspace. The technique is assessed in face recognition experiments performed on the extended YaleB and XM2VTS databases. Comparative results with state-of-the-art techniques show the competitiveness of the proposed technique.
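The NAP normalization itself reduces to a single projection, P = I - U U^T, where the columns of U span the estimated nuisance (here, illumination) subspace. A small numpy sketch, with a hypothetical subspace estimated from same-identity difference vectors:

```python
import numpy as np

def nap_projection(U):
    # Projection that removes the span of U: P = I - U U^T,
    # with U column-orthonormal.
    return np.eye(U.shape[0]) - U @ U.T

rng = np.random.default_rng(1)
# Hypothetical difference vectors between images of the same face
# under different lighting (256-dim toy features).
diffs = rng.normal(size=(100, 256))
U, _, _ = np.linalg.svd(diffs.T, full_matrices=False)
P = nap_projection(U[:, :3])        # project away 3 illumination dimensions

face_vec = rng.normal(size=256)
normalized = P @ face_vec           # illumination-normalized representation
```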
Rok Gajšek; Vitomir Štruc; France Mihelič: Multi-modal Emotion Recognition using Canonical Correlations and Acoustic Features. Conference. In: Proceedings of the International Conference on Pattern Recognition (ICPR), pp. 4133-4136, IAPR, Istanbul, Turkey, 2010.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/ICPR2010_Emo.pdf

Abstract: Information about the psycho-physical state of the subject is becoming a valuable addition to modern audio and video recognition systems. As well as enabling a better user experience, it can also improve the recognition accuracy of the base system. In this article, we present our approach to a multi-modal (audio-video) emotion recognition system. For the audio sub-system, a feature set comprised of prosodic, spectral and cepstral features is selected and a support vector classifier is used to produce the scores for each emotional category. For the video sub-system a novel approach is presented, which does not rely on tracking specific facial landmarks and thus eliminates the problems usually caused when the tracking algorithm fails to detect the correct area. The system is evaluated on the eNTERFACE database and the recognition accuracy of our audio-video fusion is compared to the published results in the literature.
Rok Gajšek; Vitomir Štruc; France Mihelič: Multi-modal Emotion Recognition based on the Decoupling of Emotion and Speaker Information. Conference. In: Proceedings of Text, Speech and Dialogue (TSD), Lecture Notes in Computer Science, vol. 6231/2010, pp. 275-282, Springer-Verlag, Berlin, Heidelberg, 2010.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/TSDEmo.pdf

Abstract: The standard features used in emotion recognition carry, besides the emotion-related information, also cues about the speaker. This is expected, since the nature of emotionally colored speech is similar to the variations in the speech signal caused by different speakers. We therefore present a gradient-descent-derived transformation for decoupling the emotion and speaker information contained in the acoustic features. The Interspeech '09 Emotion Challenge feature set is used as the baseline for the audio part. A similar procedure is employed on the video signal, where the nuisance attribute projection (NAP) is used to derive the transformation matrix, which contains information about the emotional state of the speaker. Ultimately, different NAP transformation matrices are compared using canonical correlations. The audio and video sub-systems are combined at the matching-score level using different fusion techniques. The presented system is assessed on the publicly available eNTERFACE'05 database, where significant improvements in recognition performance are observed when compared to the state-of-the-art baseline.
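Canonical correlations between two matrices' column spaces can be computed from the singular values of the product of their orthonormal bases (the Björck-Golub method); a sketch of how two NAP transformation matrices might be compared:

```python
import numpy as np

def canonical_correlations(A, B):
    """Canonical correlations between the column spaces of A and B:
    the singular values of Qa^T Qb, where Qa, Qb are orthonormal bases."""
    Qa, _ = np.linalg.qr(A)
    Qb, _ = np.linalg.qr(B)
    return np.linalg.svd(Qa.T @ Qb, compute_uv=False)

rng = np.random.default_rng(2)
A = rng.normal(size=(50, 5))                              # toy NAP matrix 1
B = A @ rng.normal(size=(5, 5)) + 0.1 * rng.normal(size=(50, 5))  # similar subspace
print(canonical_correlations(A, B))  # values near 1 indicate similar subspaces
```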
2009
Rok Gajšek; Vitomir Štruc; Simon Dobrišek; France Mihelič: Emotion recognition using linear transformations in combination with video. Conference. In: Speech and intelligence: proceedings of Interspeech 2009, pp. 1967-1970, Brighton, UK, 2009.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/InSP.pdf

Abstract: The paper discusses the use of linear transformations of Hidden Markov Models, normally employed for speaker and environment adaptation, as a way of extracting the emotional components from speech. A constrained version of the Maximum Likelihood Linear Regression (CMLLR) transformation is used as a feature for the classification of normal or aroused emotional states. We present a procedure for incrementally building a set of speaker-independent acoustic models that are used to estimate the CMLLR transformations for emotion classification. An audio-video database of spontaneous emotions (AvID) is briefly presented, since it forms the basis for the evaluation of the proposed method. Emotion classification using the video part of the database is also described, and the added value of combining the visual information with the audio features is shown.
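For context, CMLLR is an affine feature-space transform x' = Ax + b. In this work the estimated transform parameters themselves serve as the per-recording feature; the sketch below shows only that packaging step (maximum-likelihood estimation of A and b over an HMM is omitted, and all dimensions are illustrative):

```python
import numpy as np

def apply_cmllr(features, A, b):
    # Constrained MLLR feature-space transform: x' = A x + b per frame.
    return features @ A.T + b

def cmllr_feature_vector(A, b):
    # Flattened transform parameters as a fixed-length emotion descriptor.
    return np.concatenate([A.ravel(), b])

rng = np.random.default_rng(5)
A = np.eye(13) + 0.01 * rng.normal(size=(13, 13))   # stand-in for an ML estimate
b = rng.normal(size=13)
frames = rng.normal(size=(100, 13))                 # e.g., 13-dim MFCC frames
adapted = apply_cmllr(frames, A, b)
feat = cmllr_feature_vector(A, b)                   # 13*13 + 13 = 182 dims
```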
Rok Gajšek; Vitomir Štruc; France Mihelič; Anja Podlesek; Luka Komidar; Gregor Sočan; Boštjan Bajec: Multi-modal emotional database: AvID. Journal Article. In: Informatica (Ljubljana), 33 (1), pp. 101-106, 2009.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/avid.pdf

Abstract: This paper presents our work on recording a multi-modal database containing emotional audio and video recordings. In designing the recording strategies, special attention was paid to gathering data involving spontaneous emotions, and therefore to obtaining more realistic training and testing conditions for experiments. With specially planned scenarios, including playing computer games and conducting an adaptive intelligence test, different levels of arousal were induced. This will enable us both to detect different emotional states and to experiment with speaker identification/verification of the people involved in the communication. So far, the multi-modal database has been recorded and a basic evaluation of the data has been carried out.
Vitomir Štruc; Rok Gajšek; France Mihelič; Nikola Pavešić: Using regression techniques for coping with the one-sample-size problem of face recognition. Journal Article. In: Electrotechnical Review, 76 (1-2), pp. 7-12, 2009.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/EV2008.pdf

Abstract: There are a number of face recognition paradigms that ensure good recognition rates with frontal face images. However, the majority of them require an extensive training set and degrade in performance when an insufficient number of training images is available. This is especially true for applications where only one image per subject is at hand for training. To cope with this one-sample-size (OSS) problem, we propose to employ subspace-projection-based regression techniques rather than modifications of the established face recognition paradigms, such as principal component or linear discriminant analysis, as was done in the past. Experiments performed on the XM2VTS and ORL databases show the effectiveness of the proposed approach. Also presented is a comparative assessment of several regression techniques and some popular face recognition methods.
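A minimal sketch of the regression idea: ridge regression onto a one-hot response matrix encoding class membership, which stays well defined even with a single training image per subject. The feature dimensions and regularization below are illustrative assumptions:

```python
import numpy as np

def train_ls_classifier(X, labels, n_classes, reg=1e-3):
    """Ridge regression onto a one-hot response matrix Y that encodes the
    class membership of the training samples."""
    Y = np.eye(n_classes)[labels]                       # response matrix
    d = X.shape[1]
    return np.linalg.solve(X.T @ X + reg * np.eye(d), X.T @ Y)

def classify(W, x):
    return int(np.argmax(x @ W))

rng = np.random.default_rng(3)
X = rng.normal(size=(5, 64))        # one 64-dim image vector per subject
W = train_ls_classifier(X, np.arange(5), 5)
# A slightly perturbed version of subject 2's image is classified as 2.
print(classify(W, X[2] + 0.05 * rng.normal(size=64)))
```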
Rok Gajšek; Vitomir Štruc; Simon Dobrišek; Janez Žibert; France Mihelič; Nikola Pavešić: Combining audio and video for detection of spontaneous emotions. Conference. In: Biometric ID management and multimodal communication, Lecture Notes in Computer Science, vol. 5707, pp. 114-121, Springer-Verlag, Berlin, Heidelberg, 2009.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/BioID_R.pdf

Abstract: The paper presents our initial attempts at building an audio-video emotion recognition system. Both the audio and video sub-systems are discussed, and a description of the database of spontaneous emotions is given. The task of labelling the recordings from the database according to different emotions is discussed, and the measured agreement between multiple annotators is presented. Instead of focusing on prosody in audio emotion recognition, we evaluate the possibility of using linear transformations (CMLLR) as features. The classification results from the audio and video sub-systems are combined using sum-rule fusion, and the increase in recognition results when using both modalities is presented.
Rok Gajšek; Vitomir Štruc; Boštjan Vesnicer; Anja Podlesek; Luka Komidar; France Mihelič: Analysis and assessment of AvID: multi-modal emotional database. Conference. In: Text, speech and dialogue / 12th International Conference, Lecture Notes in Computer Science, vol. 5729, pp. 266-273, Springer-Verlag, Berlin, Heidelberg, 2009.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/TSD.pdf

Abstract: The paper deals with the recording and evaluation of a multi-modal (audio/video) database of spontaneous emotions. Firstly, the motivation for this work is given and the different recording strategies used are described. Special attention is given to the process of evaluating the emotional database. Different kappa statistics normally used in measuring the agreement between annotators are discussed. Following the problems of standard kappa coefficients when used in emotional database assessment, a new time-weighted free-marginal kappa is presented. It differs from the other kappa statistics in that it weights each utterance's particular score of agreement based on the duration of the utterance. The new method is evaluated and its superiority over the standard kappa, when dealing with a database of spontaneous emotions, is demonstrated.
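A sketch of the time-weighted free-marginal kappa for two annotators; weighting each utterance's agreement by its duration is the natural reading of the description above, though the paper's exact formula may differ:

```python
import numpy as np

def time_weighted_free_marginal_kappa(labels_a, labels_b, durations, n_cats):
    """kappa = (Po - Pe) / (1 - Pe), with free-marginal chance agreement
    Pe = 1/k and observed agreement Po weighted by utterance duration."""
    labels_a, labels_b = np.asarray(labels_a), np.asarray(labels_b)
    w = np.asarray(durations, dtype=float)
    po = np.sum(w * (labels_a == labels_b)) / w.sum()   # duration-weighted agreement
    pe = 1.0 / n_cats
    return (po - pe) / (1.0 - pe)

# Two annotators, 4 utterances with durations in seconds, 7 emotion categories.
print(time_weighted_free_marginal_kappa([0, 2, 2, 5], [0, 2, 3, 5],
                                        [1.2, 0.8, 3.5, 2.0], 7))
```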
2008
Vitomir Štruc; France Mihelič; Rok Gajšek; Nikola Pavešić: Regression techniques versus discriminative methods for face recognition. Inproceedings. In: Proceedings of the 9th international PhD Workshop on Systems and Control, pp. 1-5, Izola, Slovenia, 2008.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/IZOLA.pdf

Abstract: In the field of face recognition it is generally believed that "state-of-the-art" recognition rates can only be achieved when discriminative (e.g., linear or generalized discriminant analysis) rather than expressive (e.g., principal or kernel principal component analysis) methods are used for facial feature extraction. However, while being superior in terms of recognition rates, the discriminative techniques still exhibit some shortcomings when compared to the expressive approaches. More specifically, they suffer from the so-called small-sample-size (SSS) problem, which is regularly encountered in the field of face recognition and occurs when the sample dimensionality is larger than the number of available training samples per subject. In this type of problem, the discriminative techniques need modifications in order to be feasible, and even in their most elaborate forms they require at least two training samples per subject. The expressive approaches, on the other hand, are not susceptible to the SSS problem and are thus applicable even in the most extreme case of the small-sample-size problem, i.e., when only one training sample per subject is available. Nevertheless, in this paper we show that the recognition performance of the expressive methods can match (or in some cases surpass) that of the discriminative techniques if the expressive feature extraction approaches are used as multivariate regression techniques with a pre-designed response matrix that encodes the class membership of the training samples. The effectiveness of the regression techniques for face recognition is demonstrated in a series of experiments performed on the ORL database. Additionally, a comparative assessment of the regression techniques and popular discriminative approaches is presented.
Vitomir Štruc; France Mihelič; Nikola Pavešić: Combining experts for improved face verification performance. Inproceedings. In: Proceedings of the IEEE International Electrotechnical and Computer Science Conference (ERK'08), pp. 233-236, Portorož, Slovenia, 2008.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/ERK2008.pdf

Abstract (translated from Slovenian): Automatic face recognition (authentication/identification) represents one of the most active research areas in biometrics. Person authentication or identification through face recognition offers a possible way of increasing security in various activities, e.g., in electronic commerce on the internet, in banking services, or when entering certain premises, buildings and countries. It offers a universal and unobtrusive way of recognizing people, which, however, is currently not yet sufficiently reliable. As a possible solution to the reliability problem, multi-modal approaches, in which recognition is performed on the basis of several face recognition procedures, are appearing increasingly often in the literature. In line with this trend, the paper evaluates the reliability of various face recognition procedures, which are ultimately combined into a multi-modal approach. Through experiments on the XM2VTS database, we assess the reliability of the multi-modal approach and compare it to that of established recognition procedures.
Vitomir Štruc; France Mihelič; Nikola Pavešić: Face authentication using a hybrid approach. Journal Article. In: Journal of Electronic Imaging, 17 (1), pp. 1-11, 2008.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2016/09/JEI.pdf
DOI: 10.1117/1.2885149

Abstract: This paper presents a hybrid approach to face-feature extraction based on the trace transform and the novel kernel partial-least-squares discriminant analysis (KPA). The hybrid approach, called trace kernel partial-least-squares discriminant analysis (TKPA), first uses a set of fifteen trace functionals to derive robust and discriminative facial features and then applies the KPA method to reduce their dimensionality. The feasibility of the proposed approach was successfully tested on the XM2VTS database, where a false rejection rate (FRR) of 1.25% and a false acceptance rate (FAR) of 2.11% were achieved in our best-performing face-authentication experiment. The experimental results also show that the proposed approach can outperform kernel methods such as generalized discriminant analysis (GDA), kernel Fisher analysis (KFA) and complete kernel Fisher discriminant analysis (CKFA), as well as combinations of these methods with features extracted using the trace transform.
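The reported FRR and FAR follow the standard verification definitions; a small sketch on synthetic genuine/impostor score distributions:

```python
import numpy as np

def far_frr(genuine, impostor, threshold):
    """False rejection rate over genuine scores and false acceptance rate
    over impostor scores at a given threshold (higher score = accept)."""
    genuine, impostor = np.asarray(genuine), np.asarray(impostor)
    frr = np.mean(genuine < threshold)
    far = np.mean(impostor >= threshold)
    return far, frr

rng = np.random.default_rng(4)
genuine = rng.normal(2.0, 1.0, 1000)    # synthetic genuine-match scores
impostor = rng.normal(0.0, 1.0, 1000)   # synthetic impostor scores
print(far_frr(genuine, impostor, threshold=1.0))
```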
Rok Gajšek; Anja Podlesek; Luka Komidar; Gregor Sočan; Boštjan Bajec; Vitomir Štruc; Valentin Bucik; France Mihelič: AvID: audio-video emotional database. Inproceedings. In: Proceedings of the 11th International Multi-conference Information Society (IS'08), vol. C, pp. 70-74, Ljubljana, Slovenia, 2008.
2007
Vitomir Štruc; France Mihelič; Nikola Pavešić: Color spaces for face recognition. Inproceedings. In: Proceedings of the International Electrotechnical and Computer Science Conference (ERK'07), pp. 171-174, Portorož, Slovenia, 2007.
URL: http://luks.fe.uni-lj.si/nluks/wp-content/uploads/2014/08/ERK2007.pdf

Abstract: The paper investigates the impact that the face-image color space has on the verification performance of two popular face recognition procedures, i.e., the Fisherface approach and the Gabor-Fisher classifier (GFC). Experimental results on the XM2VTS database show that the Fisherface technique performs best when features are extracted from the Cr component of the YCbCr color space, while the performance of the Gabor-Fisher classifier is optimized when grey-scale intensity face images are used for feature extraction. Based on these findings, a novel face recognition framework that combines the Fisherface and the GFC methods is introduced in this paper, and its feasibility is demonstrated in a comparative study in which, in addition to the proposed method, six widely used feature extraction techniques were tested for their face verification performance.
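For reference, the Cr component singled out by these experiments is obtained from RGB via the BT.601 YCbCr transform; a minimal sketch:

```python
import numpy as np

def rgb_to_cr(rgb):
    """Cr component of the (ITU-R BT.601, full-range) YCbCr transform:
    Cr = 128 + 0.5 R - 0.418688 G - 0.081312 B."""
    r = rgb[..., 0].astype(float)
    g = rgb[..., 1].astype(float)
    b = rgb[..., 2].astype(float)
    return 128.0 + 0.5 * r - 0.418688 * g - 0.081312 * b

img = (np.random.rand(8, 8, 3) * 255).astype(np.uint8)   # toy RGB face image
print(rgb_to_cr(img).shape)                               # (8, 8) Cr channel
```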