% Conference paper in the IEEE BigData 2024 proceedings on fine-tuning AraBERT
% for Saudi Dialect sentiment analysis with GPT-generated training data.
% The citation key is left unchanged so existing citations keep resolving.
% The conference name formerly repeated in the note field is already carried by
% booktitle; the conference dates now live in eventdate (ISO 8601) and the
% publisher copyright statement in the non-printing copyright field.
% NOTE(review): address holds a country, not the publisher's city; confirm the
% city and replace the value.
@inproceedings{6f780ba5f89b4b2399c22fcf28d6f7ef,
  author    = {Aftan, Sulaiman and Zhuang, Yu and Aseeri, Ahmad O. and Shah, Habib},
  title     = {Steering a Standard {Arab} Language Processing Model Towards Accurate {Saudi} Dialect Sentiment Analysis Using Generative {AI}},
  booktitle = {Proceedings - 2024 {IEEE} International Conference on Big Data, {BigData} 2024},
  editor    = {Ding, Wei and Lu, Chang-Tien and Wang, Fusheng and Di, Liping and Wu, Kesheng and Huan, Jun and Nambiar, Raghu and Li, Jundong and Ilievski, Filip and Baeza-Yates, Ricardo and Hu, Xiaohua},
  pages     = {5891--5900},
  publisher = {{Institute of Electrical and Electronics Engineers Inc.}},
  address   = {United States},
  year      = {2024},
  eventdate = {2024-12-15/2024-12-18},
  doi       = {10.1109/BigData62323.2024.10825944},
  language  = {English},
  keywords  = {Generative AI, NLP, Saudi Dialect, Sentiment Analysis},
  abstract  = {Sentiment analysis (SA) is crucial for many NLP applications across various domains. While Arabic is one of the world's major languages, high-quality NLP models developed for standard Arabic often underperform on regional dialects like the Saudi Dialect (SD) due to a lack of SD-specific training data. This paper presents a novel approach to adapting a high-resource language model, AraBERT, for low-resource dialect sentiment analysis by combining minimal SD data collection with generative AI. In the absence of openly accessible SD datasets, we augmented a small amount of collected SD data with GPT-generated SD data to fine-tune AraBERT for sentiment analysis in SD. Our contributions include (1) demonstrating the feasibility of low-effort data collection of a low-resource dialect for adapting existing high-resource NLP models and (2) leveraging GPT-generated data to augment collected data to enhance a high-resource language model for sentiment classification in a low-resource dialect, achieving significant improvements over the pre-trained high-resource model. These two contributions imply a potentially replicable approach that can serve as a template for future research in other low-resource NLP tasks. This paper presents a promising solution for enhancing model performance in low-resource dialects and has implications for similar under-resourced languages.},
  copyright = {{\textcopyright} 2024 IEEE},
}