{"title":"CLIP-Font: Sementic Self-Supervised Few-Shot Font Generation with Clip","authors":"Jialu Xiong, Yefei Wang, Jinshan Zeng","doi":"10.1109/icassp48485.2024.10447490","DOIUrl":"https://doi.org/10.1109/icassp48485.2024.10447490","url":null,"abstract":"","PeriodicalId":517764,"journal":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","volume":"20 4","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-04-14","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"140706483","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Elevating Visual Prompting in Transfer Learning Via Pruned Model Ensembles: No Retrain, No Pain","authors":"Brian Zhang, Yuguang Yao, Sijia Liu","doi":"10.1109/icassp48485.2024.10447808","DOIUrl":"https://doi.org/10.1109/icassp48485.2024.10447808","url":null,"abstract":"","PeriodicalId":517764,"journal":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","volume":"156 5","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-04-14","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"140706988","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Binauralmusic: A Diverse Dataset for Improving Cross-Modal Binaural Audio Generation","authors":"Yunqi Li, Shulin Liu, Haonan Cheng, Long Ye","doi":"10.1109/icassp48485.2024.10448509","DOIUrl":"https://doi.org/10.1109/icassp48485.2024.10448509","url":null,"abstract":"","PeriodicalId":517764,"journal":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","volume":"67 S10","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-04-14","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"140705089","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Seungheon Doh, Minhee Lee, Dasaem Jeong, Juhan Nam
{"title":"Enriching Music Descriptions with A Finetuned-LLM and Metadata for Text-to-Music Retrieval","authors":"Seungheon Doh, Minhee Lee, Dasaem Jeong, Juhan Nam","doi":"10.1109/icassp48485.2024.10446380","DOIUrl":"https://doi.org/10.1109/icassp48485.2024.10446380","url":null,"abstract":"","PeriodicalId":517764,"journal":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","volume":"88 2","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-04-14","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"140705865","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Dynamic Speech Emotion Recognition Using A Conditional Neural Process","authors":"Luz Martinez-Lucas, Carlos Busso","doi":"10.1109/icassp48485.2024.10447805","DOIUrl":"https://doi.org/10.1109/icassp48485.2024.10447805","url":null,"abstract":"The problem of predicting emotional attributes from speech has often focused on predicting a single value from a sentence or short speaking turn. These methods often ignore that natural emotions are both dynamic and dependent on context. To model the dynamic nature of emotions, we can treat the prediction of emotion from speech as a time-series problem. We refer to the problem of predicting these emotional traces as dynamic speech emotion recognition. Previous studies in this area have used models that treat all emotional traces as coming from the same underlying distribution. Since emotions are dependent on contextual information, these methods might obscure the context of an emotional interaction. This paper uses a neural process model with a segment-level speech emotion recognition (SER) model for this problem. This type of model leverages information from the time-series and predictions from the SER model to learn a prior that defines a distribution over emotional traces. Our proposed model performs 21% better than a bidirectional long short-term memory (BiLSTM) baseline when predicting emotional traces for valence.","PeriodicalId":517764,"journal":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","volume":"9 10","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-04-14","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"140705349","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Oluwasegun Ayokunle Somefun, Stefan Lee, V. J. Mathews
{"title":"AUTOSGM: A Unified Lowpass Regularization Framework for Accelerated Learning","authors":"Oluwasegun Ayokunle Somefun, Stefan Lee, V. J. Mathews","doi":"10.1109/icassp48485.2024.10448203","DOIUrl":"https://doi.org/10.1109/icassp48485.2024.10448203","url":null,"abstract":"","PeriodicalId":517764,"journal":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","volume":"49 2","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-04-14","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"140705776","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Sara El Bouch, J. Galy, É. Chaumette, J. Vilà‐Valls
{"title":"A Modified Cramér-Rao Bound for Discrete-Time Markovian Dynamic Systems","authors":"Sara El Bouch, J. Galy, É. Chaumette, J. Vilà‐Valls","doi":"10.1109/icassp48485.2024.10446252","DOIUrl":"https://doi.org/10.1109/icassp48485.2024.10446252","url":null,"abstract":"","PeriodicalId":517764,"journal":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","volume":"86 1","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-04-14","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"140705874","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Trades++: Enhancing Multi-Object Tracking of Real Low Confidence Targets Using a Pyramid-Like Self-Attention Model","authors":"Chenxin Wen, Yanlei Gao, Jie Li","doi":"10.1109/icassp48485.2024.10446257","DOIUrl":"https://doi.org/10.1109/icassp48485.2024.10446257","url":null,"abstract":"","PeriodicalId":517764,"journal":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","volume":"236 1","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-04-14","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"140703885","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Maskstr: Guide Scene Text Recognition Models with Masking","authors":"Baole Wei, Minghang He, Liangcai Gao, Duoyou Zhou, Xiang Bai, Zhi Tang","doi":"10.1109/icassp48485.2024.10446874","DOIUrl":"https://doi.org/10.1109/icassp48485.2024.10446874","url":null,"abstract":"","PeriodicalId":517764,"journal":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","volume":"220 6","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-04-14","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"140704631","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}