S. Bayerl, Dominik Wagner, E. Nöth, T. Bocklet, K. Riedhammer
{"title":"The Influence of Dataset Partitioning on Dysfluency Detection Systems","authors":"S. Bayerl, Dominik Wagner, E. Nöth, T. Bocklet, K. Riedhammer","doi":"10.1007/978-3-031-16270-1_35","DOIUrl":"https://doi.org/10.1007/978-3-031-16270-1_35","url":null,"abstract":"","PeriodicalId":358274,"journal":{"name":"International Conference on Text, Speech and Dialogue","volume":"203 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2022-06-07","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"116626971","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Soumyajit Mitra, Swayambhu Nath Ray, Bharat Padi, Raghavendra Bilgi, Harish Arsikere, Shalini Ghosh, A. Srinivasamurthy, S. Garimella
{"title":"Unified Modeling of Multi-Domain Multi-Device ASR Systems","authors":"Soumyajit Mitra, Swayambhu Nath Ray, Bharat Padi, Raghavendra Bilgi, Harish Arsikere, Shalini Ghosh, A. Srinivasamurthy, S. Garimella","doi":"10.48550/arXiv.2205.06655","DOIUrl":"https://doi.org/10.48550/arXiv.2205.06655","url":null,"abstract":"Modern Automatic Speech Recognition (ASR) systems often use a portfolio of domain-specific models in order to get high accuracy for distinct user utterance types across different devices. In this paper, we propose an innovative approach that integrates the different per-domain per-device models into a unified model, using a combination of domain embedding, domain experts, mixture of experts and adversarial training. We run careful ablation studies to show the benefit of each of these innovations in contributing to the accuracy of the overall unified model. Experiments show that our proposed unified modeling approach actually outperforms the carefully tuned per-domain models, giving relative gains of up to 10% over a baseline model with negligible increase in the number of parameters.","PeriodicalId":358274,"journal":{"name":"International Conference on Text, Speech and Dialogue","volume":"44 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2022-05-13","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"128978239","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Linguistic Resources Construction: Towards Disfluency Processing in Spontaneous Tunisian Dialect Speech","authors":"Emna Boughariou, Younés Bahou, Lamia Hadrich Belguith","doi":"10.1007/978-3-030-27947-9_27","DOIUrl":"https://doi.org/10.1007/978-3-030-27947-9_27","url":null,"abstract":"","PeriodicalId":358274,"journal":{"name":"International Conference on Text, Speech and Dialogue","volume":"1 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2019-09-10","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"116075521","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"The TransBank Aligner: Cross-Sentence Alignment with Deep Neural Networks","authors":"Ahmad Aghaebrahimian, M. Ustaszewski, A. Stauder","doi":"10.1007/978-3-030-27947-9_16","DOIUrl":"https://doi.org/10.1007/978-3-030-27947-9_16","url":null,"abstract":"","PeriodicalId":358274,"journal":{"name":"International Conference on Text, Speech and Dialogue","volume":"1 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2019-09-10","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"129387090","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Malo Grisard, P. Motlícek, Wissem Allouchi, Michael Baeriswyl, Alexandros Lazaridis, Qingran Zhan
{"title":"Spoken Language Identification Using Language Bottleneck Features","authors":"Malo Grisard, P. Motlícek, Wissem Allouchi, Michael Baeriswyl, Alexandros Lazaridis, Qingran Zhan","doi":"10.1007/978-3-030-27947-9_32","DOIUrl":"https://doi.org/10.1007/978-3-030-27947-9_32","url":null,"abstract":"","PeriodicalId":358274,"journal":{"name":"International Conference on Text, Speech and Dialogue","volume":"1 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2019-09-10","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"129146319","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Explicit and Implicit Discourse Relations in the Prague Discourse Treebank","authors":"Sárka Zikánová, Jirí Mírovský, Pavlína Synková","doi":"10.1007/978-3-030-27947-9_20","DOIUrl":"https://doi.org/10.1007/978-3-030-27947-9_20","url":null,"abstract":"","PeriodicalId":358274,"journal":{"name":"International Conference on Text, Speech and Dialogue","volume":"53 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2019-09-10","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"122707715","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"On Practical Aspects of Multi-condition Training Based on Augmentation for Reverberation-/Noise-Robust Speech Recognition","authors":"J. Málek, J. Zdánský","doi":"10.1007/978-3-030-27947-9_21","DOIUrl":"https://doi.org/10.1007/978-3-030-27947-9_21","url":null,"abstract":"","PeriodicalId":358274,"journal":{"name":"International Conference on Text, Speech and Dialogue","volume":"19 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2019-09-10","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"127295422","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Question-Answering Dialog System for Large Audiovisual Archives","authors":"Adam Chýlek, L. Smídl, J. Svec","doi":"10.1007/978-3-030-27947-9_33","DOIUrl":"https://doi.org/10.1007/978-3-030-27947-9_33","url":null,"abstract":"","PeriodicalId":358274,"journal":{"name":"International Conference on Text, Speech and Dialogue","volume":"50 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2019-09-10","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"127416607","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"A Comparison of Hybrid and End-to-End Models for Syllable Recognition","authors":"Sebastian P. Bayerl, K. Riedhammer","doi":"10.1007/978-3-030-27947-9_30","DOIUrl":"https://doi.org/10.1007/978-3-030-27947-9_30","url":null,"abstract":"","PeriodicalId":358274,"journal":{"name":"International Conference on Text, Speech and Dialogue","volume":"47 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2019-09-10","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"126869493","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"A Self-organizing Feature Map for Arabic Word Extraction","authors":"Hassina Bouressace, J. Csirik","doi":"10.1007/978-3-030-27947-9_11","DOIUrl":"https://doi.org/10.1007/978-3-030-27947-9_11","url":null,"abstract":"","PeriodicalId":358274,"journal":{"name":"International Conference on Text, Speech and Dialogue","volume":"47 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2019-09-10","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"121970976","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}