{"title":"AshPipe: Asynchronous Hybrid Pipeline Parallel for DNN Training","authors":"Ryubu Hosoki, Toshio Endo, Takahiro Hirofuchi, Tsutomu Ikegami","doi":"10.1145/3635035.3635045","DOIUrl":"https://doi.org/10.1145/3635035.3635045","url":null,"abstract":"","PeriodicalId":314778,"journal":{"name":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region","volume":"66 10","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-01-18","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"139526731","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"QUBO formulation using inequalities for problems with complex constraints","authors":"Tomoko Komiyama, Tomohiro Suzuki","doi":"10.1145/3635035.3635042","DOIUrl":"https://doi.org/10.1145/3635035.3635042","url":null,"abstract":"","PeriodicalId":314778,"journal":{"name":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region","volume":"73 13","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-01-18","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"139526589","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Andres Sewell, Ke Fan, Ahmedur Rahman Shovon, Landon Dyken, Sidharth Kumar, Steve Petruzza
{"title":"Bruck Algorithm Performance Analysis for Multi-GPU All-to-All Communication","authors":"Andres Sewell, Ke Fan, Ahmedur Rahman Shovon, Landon Dyken, Sidharth Kumar, Steve Petruzza","doi":"10.1145/3635035.3635047","DOIUrl":"https://doi.org/10.1145/3635035.3635047","url":null,"abstract":"","PeriodicalId":314778,"journal":{"name":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region","volume":"2 7","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-01-18","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"139526199","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
C. Kodama, H. Yashiro, Takashi Arakawa, Daisuke Takasuka, Shuhei Matsugishi, Hirofumi Tomita
{"title":"Parallelized Remapping Algorithms for km-scale Global Weather and Climate Simulations with Icosahedral Grid System","authors":"C. Kodama, H. Yashiro, Takashi Arakawa, Daisuke Takasuka, Shuhei Matsugishi, Hirofumi Tomita","doi":"10.1145/3635035.3635040","DOIUrl":"https://doi.org/10.1145/3635035.3635040","url":null,"abstract":"","PeriodicalId":314778,"journal":{"name":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region","volume":"80 11","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-01-18","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"139526348","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Bengisu Elis, Olga Pearce, David Boehme, J. Burmark, Martin Schulz
{"title":"Non-Blocking GPU-CPU Notifications to Enable More GPU-CPU Parallelism","authors":"Bengisu Elis, Olga Pearce, David Boehme, J. Burmark, Martin Schulz","doi":"10.1145/3635035.3635036","DOIUrl":"https://doi.org/10.1145/3635035.3635036","url":null,"abstract":"","PeriodicalId":314778,"journal":{"name":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region","volume":"75 4","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-01-18","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"139526527","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Taisei Matsushima, Ken Iwata, Naohisa Sakamoto, J. Nonaka, Chongke Bi
{"title":"Information Entropy-based Camera Focus Point and Zoom Level Adjustment for Smart In-Situ Visualization","authors":"Taisei Matsushima, Ken Iwata, Naohisa Sakamoto, J. Nonaka, Chongke Bi","doi":"10.1145/3635035.3635049","DOIUrl":"https://doi.org/10.1145/3635035.3635049","url":null,"abstract":"","PeriodicalId":314778,"journal":{"name":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region","volume":"87 6","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-01-18","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"139526567","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
Cheng-Hsiang Chiu, Zhicheng Xiong, Zizheng Guo, Tsung-Wei Huang, Yibo Lin
{"title":"An Efficient Task-Parallel Pipeline Programming Framework","authors":"Cheng-Hsiang Chiu, Zhicheng Xiong, Zizheng Guo, Tsung-Wei Huang, Yibo Lin","doi":"10.1145/3635035.3635037","DOIUrl":"https://doi.org/10.1145/3635035.3635037","url":null,"abstract":"","PeriodicalId":314778,"journal":{"name":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region","volume":"78 2","pages":""},"PeriodicalIF":0.0,"publicationDate":"2024-01-18","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"139526245","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Associative Operator Precedence Parsing: A Method To Increase Data Parsing Parallelism","authors":"Le Li, K. Taura","doi":"10.1145/3578178.3578233","DOIUrl":"https://doi.org/10.1145/3578178.3578233","url":null,"abstract":"Many data often come with a high volume in textual format (JSON, XML, CSV). Because parsing can easily dominate data analysis time, researchers have been working on parallelizing parsing. Operator Precedence Parsing (OPP), among candidate parsing methods, is amenable to parallelization, with a practical algorithm proposed. The “locally parsable” property allows the parser to deduce if a reduction is safe with limited context. However, when the grammar has productions that tend to produce a highly skewed parse tree, OPP raises reductions mostly in serial, and the parsing still suffers from a long critical path. In practice, OPP has little or even no speedup when parsing data because data often contain a high percentage of parallel elements (e.g., JSON array elements separated by commas) produced from such productions, a situation that frequently occurs when processing big data. To address this issue and scale textual data parsing, we propose a parsing algorithm that lifts the restriction of deterministic parsing. For an ambiguous grammar, the parser non-deterministically produces a subtree for parallel elements. Such parsers can still produce deterministic semantics when the operator that connects these subtrees is considered associative for data analysis (e.g., map-union). We thus name the algorithm Associative OPP (AOPP), where parsing a large sequence of parallel elements can enjoy much parallelism as reductions can happen in any order.\nWe show that AOPP is of practical use and scales in most cases through textual data parsing.","PeriodicalId":314778,"journal":{"name":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region","volume":"25 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2023-02-27","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"121887171","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Exploiting Data Parallelism in Graph-Based Simultaneous Localization and Mapping: A Case Study with GPU Accelerations","authors":"Junyuan Zheng, Yuan He, Masaaki Kondo","doi":"10.1145/3578178.3578237","DOIUrl":"https://doi.org/10.1145/3578178.3578237","url":null,"abstract":"Graph-based simultaneous localization and mapping (G-SLAM) is an intuitive SLAM implementation where graphs are used to represent poses, landmarks and sensor measurements when a mobile robot builds a map of the environment and locates itself in it. Being a very important application employed in many realistic scenarios, estimating the whole environment and all trajectories through solving graph problems for SLAM can incur a large amount of computation and consume a significant amount of energy. For the purpose of improving both performance and energy efficiency, we have unveiled the critical path of the G-SLAM algorithm in this paper and implemented a GPU-based solution to aid it. Furthermore, we have attempted to offload performance-critical components (such as matrix inversions when updating the trajectory) in the G-SLAM process into GPUs through CUDA to exploit data parallelism.\nWith our solution, we observe a speed-up of up to 19.7x and an energy saving of up to 83.7% over a modern workstation class x86 CPU; while on a platform dedicated for edge computing (NVIDIA Jetson Nano), we achieve a speed-up of up to 2.5x and an energy saving of up to 6.4% with its integrated GPU, respectively.","PeriodicalId":314778,"journal":{"name":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region","volume":"17 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2023-02-27","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"120945926","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}
{"title":"Memory Usage Prediction of HPC Workloads Using Feature Engineering and Machine Learning","authors":"Md Nahid Newaz, Md Atiqul Mollah","doi":"10.1145/3578178.3578241","DOIUrl":"https://doi.org/10.1145/3578178.3578241","url":null,"abstract":"In High Performance Computing (HPC) systems, numerous applications of varying scale and domain are scheduled to run concurrently, and share the available CPU and memory capacities among themselves. Applications whose run-time memory usage are not known a priori, are commonly allocated with significantly higher amounts of memory than actually needed, which leads to poor resource utilization and performance degradation of the overall system. In this paper, we disseminate our experience of performing user analysis and prediction over a large-scale resource utilization dataset to tightly estimate the memory requirements of a wide variety of applications in the Titan supercomputer system. By coupling our engineered features with random forest and XGBoost supervised machine learning techniques, our models respectively predict the correct class of memory usage in 89% and 90% of the validation data. Furthermore, more than 98% of users have 95% or better average prediction accuracy within one class tolerance range of the actual memory usage.","PeriodicalId":314778,"journal":{"name":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region","volume":"42 1","pages":"0"},"PeriodicalIF":0.0,"publicationDate":"2023-02-27","publicationTypes":"Journal Article","fieldsOfStudy":null,"isOpenAccess":false,"openAccessPdf":"","citationCount":null,"resultStr":null,"platform":"Semanticscholar","paperid":"121574522","PeriodicalName":null,"FirstCategoryId":null,"ListUrlMain":null,"RegionNum":0,"RegionCategory":"","ArticlePicture":[],"TitleCN":null,"AbstractTextCN":null,"PMCID":"","EPubDate":null,"PubModel":null,"JCR":null,"JCRName":null,"Score":null,"Total":0}