Hongyang Du, Ruichen Zhang, Dusit Niyato, Jiawen Kang, Zehui Xiong, Shuguang Cui, Xuemin Shen, Dong In Kim
{"title":"Reinforcement Learning With LLMs Interaction For Distributed Diffusion Model Services","authors":"Hongyang Du, Ruichen Zhang, Dusit Niyato, Jiawen Kang, Zehui Xiong, Shuguang Cui, Xuemin Shen, Dong In Kim","doi":"10.1109/tpami.2025.3584698","DOIUrl":"https://doi.org/10.1109/tpami.2025.3584698","url":null,"abstract":"","PeriodicalId":13426,"journal":{"name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","volume":"633 1","pages":""},"PeriodicalIF":23.6,"publicationDate":"2025-06-30","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"144520687","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":1,"RegionCategory":"计算机科学","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Ji Zhang, Jingkuan Song, Lianli Gao, Nicu Sebe, Heng Tao Shen
{"title":"Reliable Few-shot Learning under Dual Noises","authors":"Ji Zhang, Jingkuan Song, Lianli Gao, Nicu Sebe, Heng Tao Shen","doi":"10.1109/tpami.2025.3584051","DOIUrl":"https://doi.org/10.1109/tpami.2025.3584051","url":null,"abstract":"","PeriodicalId":13426,"journal":{"name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","volume":"74 1","pages":""},"PeriodicalIF":23.6,"publicationDate":"2025-06-30","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"144520674","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":1,"RegionCategory":"计算机科学","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Revisiting Essential and Nonessential Settings of Evidential Deep Learning","authors":"Mengyuan Chen, Junyu Gao, Changsheng Xu","doi":"10.1109/tpami.2025.3583410","DOIUrl":"https://doi.org/10.1109/tpami.2025.3583410","url":null,"abstract":"","PeriodicalId":13426,"journal":{"name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","volume":"26 1","pages":""},"PeriodicalIF":23.6,"publicationDate":"2025-06-26","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"144500692","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":1,"RegionCategory":"计算机科学","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Qilang Ye,Zitong Yu,Rui Shao,Yawen Cui,Xiangui Kang,Xin Liu,Philip Torr,Xiaochun Cao
{"title":"CAT+: Investigating and Enhancing Audio-visual Understanding in Large Language Models.","authors":"Qilang Ye,Zitong Yu,Rui Shao,Yawen Cui,Xiangui Kang,Xin Liu,Philip Torr,Xiaochun Cao","doi":"10.1109/tpami.2025.3582389","DOIUrl":"https://doi.org/10.1109/tpami.2025.3582389","url":null,"abstract":"Multimodal Large Language Models (MLLMs) have gained significant attention due to their rich internal implicit knowledge for cross-modal learning. Although advances in bringing audio-visuals into LLMs have resulted in boosts for a variety of Audio-Visual Question Answering (AVQA) tasks, they still face two crucial challenges: 1) audio-visual ambiguity, and 2) audio-visual hallucination. Existing MLLMs can respond to audio-visual content, yet sometimes fail to describe specific objects due to the ambiguity or hallucination of responses. To overcome the two aforementioned issues, we introduce the CAT+, which enhances MLLM to ensure more robust multimodal understanding. We first propose the Sequential Question-guided Module (SQM), which combines tiny transformer layers and cascades Q-Formers to realize a solid audio-visual grounding. After feature alignment and high-quality instruction tuning, we introduce Ambiguity Scoring Direct Preference Optimization (AS-DPO) to correct the problem of CAT+ bias toward ambiguous descriptions. To explore the hallucinatory deficits of MLLMs in dynamic audio-visual scenes, we build a new Audio-visual Hallucination Benchmark, named AVHbench. This benchmark detects the extent of MLLM's hallucinations across three different protocols in the perceptual object, counting, and holistic description tasks. Extensive experiments across video-based understanding, open-ended, and close-ended AVQA demonstrate the superior performance of our method.","PeriodicalId":13426,"journal":{"name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","volume":"26 1","pages":""},"PeriodicalIF":23.6,"publicationDate":"2025-06-25","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"144488043","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":1,"RegionCategory":"计算机科学","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}