{"title":"An empirical study on analysis window functions for text-independent speaker recognition","authors":"Bidhan Barai, N. Das, Subhadip Basu, M. Nasipuri","doi":"10.1007/s10772-023-10024-1","DOIUrl":"https://doi.org/10.1007/s10772-023-10024-1","url":null,"abstract":"","PeriodicalId":14305,"journal":{"name":"International Journal of Speech Technology","volume":"26 1","pages":"211-220"},"PeriodicalIF":0.0,"publicationDate":"2023-02-19","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"41522450","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Weibull and Nakagami speech priors based regularized NMF with adaptive wiener filter for speech enhancement","authors":"Chaitanya Jannu, S. Vanambathina","doi":"10.1007/s10772-023-10020-5","DOIUrl":"https://doi.org/10.1007/s10772-023-10020-5","url":null,"abstract":"","PeriodicalId":14305,"journal":{"name":"International Journal of Speech Technology","volume":"26 1","pages":"197-209"},"PeriodicalIF":0.0,"publicationDate":"2023-02-02","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"45911020","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Speaker identification and localization using shuffled MFCC features and deep learning","authors":"Mahdi Barhoush, Ahmed Hallawa, A. Schmeink","doi":"10.1007/s10772-023-10023-2","DOIUrl":"https://doi.org/10.1007/s10772-023-10023-2","url":null,"abstract":"","PeriodicalId":14305,"journal":{"name":"International Journal of Speech Technology","volume":"26 1","pages":"185-196"},"PeriodicalIF":0.0,"publicationDate":"2023-01-29","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"47692961","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Haihua Jiang, Bin Hu, Zhenyu Liu, G. Wang, Lan Zhang
{"title":"A radius-incorporated localized multiple kernel learning algorithm for detecting depression in speech","authors":"Haihua Jiang, Bin Hu, Zhenyu Liu, G. Wang, Lan Zhang","doi":"10.1007/s10772-023-10017-0","DOIUrl":"https://doi.org/10.1007/s10772-023-10017-0","url":null,"abstract":"","PeriodicalId":14305,"journal":{"name":"International Journal of Speech Technology","volume":"1 1","pages":"1-8"},"PeriodicalIF":0.0,"publicationDate":"2023-01-23","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"44669858","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"An automated speech analysis system for the detection of cognitive decline in elderly","authors":"C. Loizou, M. Pantzaris","doi":"10.1007/s10772-023-10016-1","DOIUrl":"https://doi.org/10.1007/s10772-023-10016-1","url":null,"abstract":"","PeriodicalId":14305,"journal":{"name":"International Journal of Speech Technology","volume":"1 1","pages":"1-17"},"PeriodicalIF":0.0,"publicationDate":"2023-01-19","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"43542889","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Shubam Sachdeva, Haoyao Ruan, Ghassan Hamarneh, Dawn M Behne, Allard Jongman, Joan A Sereno, Yue Wang
{"title":"Plain-to-clear speech video conversion for enhanced intelligibility.","authors":"Shubam Sachdeva, Haoyao Ruan, Ghassan Hamarneh, Dawn M Behne, Allard Jongman, Joan A Sereno, Yue Wang","doi":"10.1007/s10772-023-10018-z","DOIUrl":"https://doi.org/10.1007/s10772-023-10018-z","url":null,"abstract":"<p><p>Clearly articulated speech, relative to plain-style speech, has been shown to improve intelligibility. We examine if visible speech cues in video only can be systematically modified to enhance clear-speech visual features and improve intelligibility. We extract clear-speech visual features of English words varying in vowels produced by multiple male and female talkers. Via a frame-by-frame image-warping based video generation method with a controllable parameter (displacement factor), we apply the extracted clear-speech visual features to videos of plain speech to synthesize clear speech videos. We evaluate the generated videos using a robust, state of the art AI Lip Reader as well as human intelligibility testing. \nThe contributions of this study are: (1) we successfully extract relevant visual cues for video modifications across speech styles, and have achieved enhanced intelligibility for AI; (2) this work suggests that universal talker-independent clear-speech features may be utilized to modify any talker's visual speech style; (3) we introduce \"displacement factor\" as a way of systematically scaling the magnitude of displacement modifications between speech styles; and (4) the high definition generated videos make them ideal candidates for human-centric intelligibility and perceptual training studies.</p>","PeriodicalId":14305,"journal":{"name":"International Journal of Speech Technology","volume":"26 1","pages":"163-184"},"PeriodicalIF":0.0,"publicationDate":"2023-01-01","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10042924/pdf/","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"9611085","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"OA","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Retraction Note: Preserving learnability and intelligibility at the point of care with assimilation of different speech recognition techniques","authors":"Sukumar Rajendran, P. Jayagopal","doi":"10.1007/s10772-022-09996-3","DOIUrl":"https://doi.org/10.1007/s10772-022-09996-3","url":null,"abstract":"","PeriodicalId":14305,"journal":{"name":"International Journal of Speech Technology","volume":"25 1","pages":"39"},"PeriodicalIF":0.0,"publicationDate":"2022-12-01","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"52286102","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}