% Conference paper in the IoTBDS 2022 proceedings (pp. 127--132).
% The percent sign in the abstract is escaped and the dash in the note is written as "--"
% so the entry stays valid LaTeX when those fields are typeset.
@inproceedings{70b4b18453c1417cbd6e6036675965e7,
title = "Constructing High Quality Bilingual Corpus using Parallel Data from the {Web}",
abstract = "Natural language machine translation system requires a high-quality bilingual corpus to support its efficient translation operation at high accuracy rate. In this paper, we propose a bilingual corpus construction method using parallel data from the Web. It acts as a stimulus to significantly speed up the construction. In our proposal, there are 4 phases. Parallel data is first pre-processed and refined into three sets of data for training the CNN model. Using the well-trained model, future parallel data can be selected, classified and added to the bilingual corpus. The training result showed that the test accuracy reached 98.46\%. Furthermore, the result on precision, recall and f1-score is greater than 0.9, which outperforms RNN and LSTM models.",
keywords = "Bilingual Corpus, CNN Modelling, Machine Translation, Parallel Data",
author = "Cheok, {Sai Man} and Hoi, {Lap Man} and Tang, {Su Kit} and Tse, Rita",
note = "Publisher Copyright: Copyright {\textcopyright} 2022 by SCITEPRESS -- Science and Technology Publications, Lda. All rights reserved.; 7th International Conference on Internet of Things, Big Data and Security, IoTBDS 2022 ; Conference date: 22-04-2022 Through 24-04-2022",
year = "2022",
doi = "10.5220/0010997000003194",
language = "English",
series = "International Conference on Internet of Things, Big Data and Security, {IoTBDS} - Proceedings",
publisher = "Science and Technology Publications, Lda",
pages = "127--132",
editor = "Bastieri, Denis and Wills, Gary and Kacsuk, Peter and Chang, Victor",
booktitle = "{IoTBDS} 2022 - Proceedings of the 7th International Conference on Internet of Things, Big Data and Security",
}