bibliography.bib

% UFAL Medical Corpus web page (web resource; the entry records no author or year).
% The acronym is braced so sentence-casing styles cannot turn it into "Ufal".
@misc{ufal_medical_corpus,
  title = {{UFAL} Medical Corpus},
  url   = {https://ufal.mff.cuni.cz/ufal_medical_corpus},
  note  = {Accessed [2023.10.21]},
}

% Hugging Face page for the sentence-transformers LaBSE model (web resource).
% The one-word title is braced so its mixed case survives style recasing.
@misc{LaBSEHugging,
  title = {{LaBSE}},
  url   = {https://huggingface.co/sentence-transformers/LaBSE},
  note  = {Accessed [2023.10.21]},
}
% Khresmoi Summary Translation Test Data 2.0 (dataset record, handle 11234/1-2122).
% The copyright field is kept as an annotation; standard styles ignore unknown fields.
@misc{khresmoi_summary_translation_test_data_2.0,
  author    = {Du{\v s}ek, Ond{\v r}ej and Haji{\v c}, Jan and Hlav{\'a}{\v c}ov{\'a}, Jaroslava and Libovick{\'y}, Jind{\v r}ich and Pecina, Pavel and Tamchyna, Ale{\v s} and Ure{\v s}ov{\'a}, Zde{\v n}ka},
  title     = {Khresmoi Summary Translation Test Data 2.0},
  year      = {2017},
  url       = {http://hdl.handle.net/11234/1-2122},
  note      = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\'U}FAL}), Faculty of Mathematics and Physics, Charles University},
  copyright = {Creative Commons - Attribution-{NonCommercial} 4.0 International ({CC} {BY}-{NC} 4.0)},
}

% Hugging Face Transformers documentation page (model_doc/mbart), cited for MBart50Tokenizer.
% The identifier is braced as a whole word so recasing cannot flatten its CamelCase.
@misc{MBart50Tokenizer,
  title = {{MBart50Tokenizer}},
  url   = {https://huggingface.co/docs/transformers/model_doc/mbart},
  note  = {Accessed [2023.10.21]},
}

% Tang et al. (2020), arXiv preprint 2008.00401.
% Typed as misc rather than article: the entry has no journal, which article requires,
% and the file's other eprint-based arXiv entry (feng2022languageagnostic) is also misc.
% Names are stored in the unambiguous "Last, First" form.
@misc{tang2020multilingual,
  author        = {Tang, Yuqing and Tran, Chau and Li, Xian and Chen, Peng-Jen and Goyal, Naman and Chaudhary, Vishrav and Gu, Jiatao and Fan, Angela},
  title         = {Multilingual Translation with Extensible Multilingual Pretraining and Finetuning},
  year          = {2020},
  eprint        = {2008.00401},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Ushio, Zhou and Camacho-Collados (2023), arXiv preprint 2305.15020.
% The eprint field holds the bare identifier; the archive name lives in archivePrefix
% so eprint-aware styles do not print the "arXiv:" prefix twice.
@misc{mBartParameterSize,
  author        = {Ushio, Asahi and Zhou, Yi and Camacho-Collados, Jose},
  title         = {An Efficient Multilingual Language Model Compression through Vocabulary Trimming},
  year          = {2023},
  eprint        = {2305.15020},
  archivePrefix = {arXiv},
}

% GitHub repository page for facebookresearch/LASER (web resource).
% The empty author and year fields were dropped: they carry no data and only
% provoke empty-field warnings. Case-sensitive words in the title are braced.
@misc{LASER-github,
  title        = {{GitHub} - facebookresearch/{LASER}: {Language-Agnostic} {SEntence} {Representations} --- github.com},
  howpublished = {\url{https://github.com/facebookresearch/LASER}},
  note         = {[Accessed 10-12-2023]},
}

% Schwenk and Douze (2017), 2nd Workshop on Representation Learning for NLP.
% Values are brace-delimited; name lists are written on one line (BibTeX collapses whitespace).
@inproceedings{LASER,
  author    = {Schwenk, Holger and Douze, Matthijs},
  title     = {Learning Joint Multilingual Sentence Representations with Neural Machine Translation},
  editor    = {Blunsom, Phil and Bordes, Antoine and Cho, Kyunghyun and Cohen, Shay and Dyer, Chris and Grefenstette, Edward and Hermann, Karl Moritz and Rimell, Laura and Weston, Jason and Yih, Scott},
  booktitle = {Proceedings of the 2nd Workshop on Representation Learning for {NLP}},
  pages     = {157--167},
  month     = aug,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/W17-2619},
  url       = {https://aclanthology.org/W17-2619},
  abstract  = {In this paper, we use the framework of neural machine translation to learn joint sentence representations across six very different languages. Our aim is that a representation which is independent of the language, is likely to capture the underlying semantics. We define a new cross-lingual similarity measure, compare up to 1.4M sentence representations and study the characteristics of close sentences. We provide experimental evidence that sentences that are close in embedding space are indeed semantically highly related, but often have quite different structure and syntax. These relations also hold when comparing sentences in different languages.},
}

% Bane and Zaretskaya (2021), MT Summit XVIII, Users and Providers Track.
% NMT is braced so sentence-casing styles do not lowercase the acronym.
@inproceedings{bane2021selecting,
  author    = {Bane, Fred and Zaretskaya, Anna},
  title     = {Selecting the best data filtering method for {NMT} training},
  booktitle = {Proceedings of Machine Translation Summit XVIII: Users and Providers Track},
  pages     = {89--97},
  year      = {2021},
}

% Bane et al. (2022), AMTA 15th Biennial Conference, Volume 2.
@inproceedings{bane2022comparison,
  author    = {Bane, Fred and Uguet, Celia Soler and Stribi{\.z}ew, Wiktor and Zaretskaya, Anna},
  title     = {A Comparison of Data Filtering Methods for Neural Machine Translation},
  booktitle = {Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track)},
  year      = {2022},
  pages     = {313--325},
}

% Feng et al. (2022), arXiv preprint 2007.01852.
% BERT is braced so sentence-casing styles keep the acronym in capitals;
% names are stored in the unambiguous "Last, First" form.
@misc{feng2022languageagnostic,
  author        = {Feng, Fangxiaoyu and Yang, Yinfei and Cer, Daniel and Arivazhagan, Naveen and Wang, Wei},
  title         = {Language-agnostic {BERT} Sentence Embedding},
  year          = {2022},
  eprint        = {2007.01852},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Koehn et al. (2018), findings paper of the WMT 2018 parallel corpus filtering shared task.
% The address is written city-first ("Brussels, Belgium") to match the other
% proceedings entry in this file that carries an address (LASER: "Vancouver, Canada").
@inproceedings{koehn-etal-2018-findings,
  author    = {Koehn, Philipp and Khayrallah, Huda and Heafield, Kenneth and Forcada, Mikel L.},
  title     = {Findings of the {WMT} 2018 Shared Task on Parallel Corpus Filtering},
  editor    = {Bojar, Ond{\v{r}}ej and Chatterjee, Rajen and Federmann, Christian and Fishel, Mark and Graham, Yvette and Haddow, Barry and Huck, Matthias and Yepes, Antonio Jimeno and Koehn, Philipp and Monz, Christof and Negri, Matteo and N{\'e}v{\'e}ol, Aur{\'e}lie and Neves, Mariana and Post, Matt and Specia, Lucia and Turchi, Marco and Verspoor, Karin},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  pages     = {726--739},
  month     = oct,
  year      = {2018},
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/W18-6453},
  url       = {https://aclanthology.org/W18-6453},
  abstract  = {We posed the shared task of assigning sentence-level quality scores for a very noisy corpus of sentence pairs crawled from the web, with the goal of sub-selecting 1{\%} and 10{\%} of high-quality data to be used to train machine translation systems. Seventeen participants from companies, national research labs, and universities participated in this task.},
}

% Taghipour, Khadivi and Xu (2011), MT Summit XIII. No page range is recorded in this entry.
@inproceedings{taghipour2011parallel,
  author    = {Taghipour, Kaveh and Khadivi, Shahram and Xu, Jia},
  title     = {Parallel corpus refinement as an outlier detection algorithm},
  booktitle = {Proceedings of Machine Translation Summit XIII: Papers},
  year      = {2011},
}

% Xu and Koehn (2017), EMNLP 2017.
@inproceedings{xu2017zipporah,
  author    = {Xu, Hainan and Koehn, Philipp},
  title     = {Zipporah: a fast and scalable data cleaning system for noisy web-crawled parallel corpora},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  pages     = {2945--2950},
}

% Cui et al. (2013), ACL 2013 short papers.
% The acronym had been lowercased to "smt" by the export; it is restored and
% braced so sentence-casing styles keep it as SMT.
@inproceedings{cui2013bilingual,
  author    = {Cui, Lei and Zhang, Dongdong and Liu, Shujie and Li, Mu and Zhou, Ming},
  title     = {Bilingual data cleaning for {SMT} using graph-based random walk},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages     = {340--345},
  year      = {2013},
}

% Acarcicek et al. (2020), Fifth Conference on Machine Translation.
% Accented letters use BibTeX special-character groups so sorting and labels work.
@inproceedings{accarcciccek2020filtering,
  author    = {A{\c{c}}ar{\c{c}}i{\c{c}}ek, Haluk and {\c{C}}olako{\u{g}}lu, Talha and Hatipo{\u{g}}lu, P{\i}nar Ece Aktan and Huang, Chong Hsuan and Peng, Wei},
  title     = {Filtering noisy parallel corpus using transformers with proxy task learning},
  booktitle = {Proceedings of the Fifth Conference on Machine Translation},
  year      = {2020},
  pages     = {940--946},
}

% Chaudhary et al. (2019), arXiv preprint 1906.08885.
% The arXiv identifier was stored in the journal field ("arXiv preprint arXiv:...");
% it now lives in eprint/archivePrefix, matching the file's other eprint-based entries,
% and the entry is typed misc because no journal is recorded.
@misc{chaudhary2019low,
  author        = {Chaudhary, Vishrav and Tang, Yuqing and Guzm{\'a}n, Francisco and Schwenk, Holger and Koehn, Philipp},
  title         = {Low-resource corpus filtering using multilingual sentence embeddings},
  year          = {2019},
  eprint        = {1906.08885},
  archivePrefix = {arXiv},
}

% Conneau et al. (2017), arXiv preprint 1710.04087.
% The arXiv identifier was stored in the journal field ("arXiv preprint arXiv:...");
% it now lives in eprint/archivePrefix, matching the file's other eprint-based entries,
% and the entry is typed misc because no journal is recorded.
@misc{conneau2017word,
  author        = {Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}},
  title         = {Word Translation Without Parallel Data},
  year          = {2017},
  eprint        = {1710.04087},
  archivePrefix = {arXiv},
}

% MUSE tool page on ai.meta.com (web resource).
% The empty author field was dropped (it carries no data and only provokes
% empty-field warnings); the acronym is braced so recasing cannot produce "Muse".
@misc{metaMUSE,
  title        = {{MUSE}},
  howpublished = {\url{https://ai.meta.com/tools/muse/}},
  year         = {2023},
  note         = {[Accessed 16-12-2023]},
}

% Devlin et al. (2019), BERT paper, cited through its arXiv PDF (identifier 1810.04805).
% Typed as misc because no journal is recorded, which article requires.
% The month uses the predefined macro instead of a quoted name, the acronym is
% braced against recasing, and the arXiv identifier from the URL is also given as eprint.
@misc{BERT,
  author        = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title         = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  month         = may,
  year          = {2019},
  eprint        = {1810.04805},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/pdf/1810.04805.pdf},
}

% Post (2018), the SacreBLEU paper; record exported from dblp (CoRR abs/1804.08771).
% The timestamp, biburl and bibsource fields are dblp bookkeeping kept verbatim.
@article{sacreBleu,
  author     = {Matt Post},
  title      = {A Call for Clarity in Reporting {BLEU} Scores},
  journal    = {CoRR},
  volume     = {abs/1804.08771},
  year       = {2018},
  eprint     = {1804.08771},
  eprinttype = {arXiv},
  url        = {http://arxiv.org/abs/1804.08771},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1804-08771.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
  timestamp  = {Mon, 13 Aug 2018 16:46:34 +0200},
}

% Papineni et al. (2002), the original BLEU metric paper, ACL 40th annual meeting.
@inproceedings{bleu,
  author    = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
  title     = {Bleu: a method for automatic evaluation of machine translation},
  booktitle = {Proceedings of the 40th annual meeting of the Association for Computational Linguistics},
  year      = {2002},
  pages     = {311--318},
}

% Hugging Face page for bert-base-multilingual-cased (web resource).
% The site name was stored in journal, which misc does not define and styles
% silently ignore; it is moved to howpublished so it can be rendered.
% BERT is braced so sentence-casing styles keep the acronym in capitals.
@misc{bert-hugging-face,
  title        = {{BERT} multilingual base model (cased)},
  howpublished = {huggingface.co},
  url          = {https://huggingface.co/bert-base-multilingual-cased},
  note         = {[Accessed 10-12-2023]},
}