-
Notifications
You must be signed in to change notification settings - Fork 0
/
bibliography.bib
220 lines (199 loc) · 8.93 KB
/
bibliography.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
% Web page of the UFAL Medical Corpus; the source gives no author or year.
% The acronym is braced so sentence-casing styles cannot lower-case it.
@misc{ufal_medical_corpus,
  title = {{UFAL} Medical Corpus},
  url   = {https://ufal.mff.cuni.cz/ufal_medical_corpus},
  note  = {Accessed [2023.10.21]},
}
% Web page for LaBSE on huggingface.co.
% The mixed-case name is braced so styles do not turn it into "Labse".
@misc{LaBSEHugging,
  title = {{LaBSE}},
  url   = {https://huggingface.co/sentence-transformers/LaBSE},
  note  = {Accessed [2023.10.21]},
}
% Khresmoi Summary Translation Test Data 2.0, referenced via its handle URL.
@misc{khresmoi_summary_translation_test_data_2.0,
  author    = {Du{\v s}ek, Ond{\v r}ej and
               Haji{\v c}, Jan and
               Hlav{\'a}{\v c}ov{\'a}, Jaroslava and
               Libovick{\'y}, Jind{\v r}ich and
               Pecina, Pavel and
               Tamchyna, Ale{\v s} and
               Ure{\v s}ov{\'a}, Zde{\v n}ka},
  title     = {Khresmoi Summary Translation Test Data 2.0},
  year      = {2017},
  url       = {http://hdl.handle.net/11234/1-2122},
  note      = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\'U}FAL}), Faculty of Mathematics and Physics, Charles University},
  copyright = {Creative Commons - Attribution-{NonCommercial} 4.0 International ({CC} {BY}-{NC} 4.0)},
}
% Hugging Face transformers documentation page cited for MBart50Tokenizer.
% The identifier is braced so its camelCase spelling survives title recasing.
@misc{MBart50Tokenizer,
  title = {{MBart50Tokenizer}},
  url   = {https://huggingface.co/docs/transformers/model_doc/mbart},
  note  = {Accessed [2023.10.21]},
}
% arXiv preprint identified only by its eprint number. There is no journal,
% so the entry is typed as misc; an article entry requires a journal field.
@misc{tang2020multilingual,
  author        = {Yuqing Tang and Chau Tran and Xian Li and Peng-Jen Chen and Naman Goyal and Vishrav Chaudhary and Jiatao Gu and Angela Fan},
  title         = {Multilingual Translation with Extensible Multilingual Pretraining and Finetuning},
  year          = {2020},
  eprint        = {2008.00401},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% arXiv preprint. The eprint field holds the bare identifier and the archive
% is named separately, so eprint-aware styles can build a working link.
@misc{mBartParameterSize,
  author        = {Asahi Ushio and Yi Zhou and Jose Camacho-Collados},
  title         = {An Efficient Multilingual Language Model Compression through Vocabulary Trimming},
  year          = {2023},
  eprint        = {2305.15020},
  archivePrefix = {arXiv},
}
% GitHub repository page for LASER. The source names no author or year, so
% those fields are omitted rather than left empty. Case-sensitive parts of
% the title are braced so the acronym and its expansion keep their capitals.
@misc{LASER-github,
  title        = {{GitHub} - {facebookresearch/LASER}: {Language-Agnostic SEntence Representations} --- github.com},
  howpublished = {\url{https://github.com/facebookresearch/LASER}},
  note         = {[Accessed 10-12-2023]},
}
% Schwenk and Douze (2017), workshop paper on joint multilingual sentence representations.
@inproceedings{LASER,
  author    = {Schwenk, Holger and Douze, Matthijs},
  title     = {Learning Joint Multilingual Sentence Representations with Neural Machine Translation},
  editor    = {Blunsom, Phil and Bordes, Antoine and Cho, Kyunghyun and
               Cohen, Shay and Dyer, Chris and Grefenstette, Edward and
               Hermann, Karl Moritz and Rimell, Laura and Weston, Jason and
               Yih, Scott},
  booktitle = {Proceedings of the 2nd Workshop on Representation Learning for {NLP}},
  pages     = {157--167},
  publisher = {Association for Computational Linguistics},
  address   = {Vancouver, Canada},
  month     = aug,
  year      = {2017},
  doi       = {10.18653/v1/W17-2619},
  url       = {https://aclanthology.org/W17-2619},
  abstract  = {In this paper, we use the framework of neural machine translation to learn joint sentence representations across six very different languages. Our aim is that a representation which is independent of the language, is likely to capture the underlying semantics. We define a new cross-lingual similarity measure, compare up to 1.4M sentence representations and study the characteristics of close sentences. We provide experimental evidence that sentences that are close in embedding space are indeed semantically highly related, but often have quite different structure and syntax. These relations also hold when comparing sentences in different languages.},
}
% Bane and Zaretskaya (2021), MT Summit XVIII users and providers track.
% The acronym NMT is braced so sentence-casing styles keep it upper-case.
@inproceedings{bane2021selecting,
  author    = {Bane, Fred and Zaretskaya, Anna},
  title     = {Selecting the best data filtering method for {NMT} training},
  booktitle = {Proceedings of Machine Translation Summit XVIII: Users and Providers Track},
  pages     = {89--97},
  year      = {2021},
}
% Bane et al. (2022), comparison of data filtering methods for neural machine translation.
@inproceedings{bane2022comparison,
  author    = {Bane, Fred and
               Uguet, Celia Soler and
               Stribi{\.z}ew, Wiktor and
               Zaretskaya, Anna},
  title     = {A Comparison of Data Filtering Methods for Neural Machine Translation},
  booktitle = {Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track)},
  year      = {2022},
  pages     = {313--325},
}
% arXiv preprint on language-agnostic sentence embeddings.
% BERT is braced so title recasing does not produce "bert".
@misc{feng2022languageagnostic,
  author        = {Fangxiaoyu Feng and Yinfei Yang and Daniel Cer and Naveen Arivazhagan and Wei Wang},
  title         = {Language-agnostic {BERT} Sentence Embedding},
  year          = {2022},
  eprint        = {2007.01852},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Koehn et al. (2018), findings of the WMT 2018 parallel corpus filtering shared task.
@inproceedings{koehn-etal-2018-findings,
  author    = {Koehn, Philipp and Khayrallah, Huda and Heafield, Kenneth and Forcada, Mikel L.},
  title     = {Findings of the {WMT} 2018 Shared Task on Parallel Corpus Filtering},
  editor    = {Bojar, Ond{\v{r}}ej and Chatterjee, Rajen and Federmann, Christian and
               Fishel, Mark and Graham, Yvette and Haddow, Barry and
               Huck, Matthias and Yepes, Antonio Jimeno and Koehn, Philipp and
               Monz, Christof and Negri, Matteo and N{\'e}v{\'e}ol, Aur{\'e}lie and
               Neves, Mariana and Post, Matt and Specia, Lucia and
               Turchi, Marco and Verspoor, Karin},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  pages     = {726--739},
  publisher = {Association for Computational Linguistics},
  address   = {Belgium, Brussels},
  month     = oct,
  year      = {2018},
  doi       = {10.18653/v1/W18-6453},
  url       = {https://aclanthology.org/W18-6453},
  abstract  = {We posed the shared task of assigning sentence-level quality scores for a very noisy corpus of sentence pairs crawled from the web, with the goal of sub-selecting 1{\%} and 10{\%} of high-quality data to be used to train machine translation systems. Seventeen participants from companies, national research labs, and universities participated in this task.},
}
% Taghipour, Khadivi and Xu (2011), MT Summit XIII paper on parallel corpus refinement.
@inproceedings{taghipour2011parallel,
  author    = {Taghipour, Kaveh and
               Khadivi, Shahram and
               Xu, Jia},
  title     = {Parallel corpus refinement as an outlier detection algorithm},
  booktitle = {Proceedings of Machine Translation Summit XIII: Papers},
  year      = {2011},
}
% Xu and Koehn (2017), the Zipporah data cleaning system.
@inproceedings{xu2017zipporah,
  author    = {Xu, Hainan and
               Koehn, Philipp},
  title     = {Zipporah: a fast and scalable data cleaning system for noisy web-crawled parallel corpora},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  year      = {2017},
  pages     = {2945--2950},
}
% Cui et al. (2013), ACL short paper on bilingual data cleaning.
% The acronym SMT had lost its capitals; it is restored and braced so
% styles that recase titles keep it upper-case.
@inproceedings{cui2013bilingual,
  author    = {Cui, Lei and Zhang, Dongdong and Liu, Shujie and Li, Mu and Zhou, Ming},
  title     = {Bilingual data cleaning for {SMT} using graph-based random walk},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages     = {340--345},
  year      = {2013},
}
% Acarcicek et al. (2020), Fifth Conference on Machine Translation paper on filtering noisy parallel corpora.
@inproceedings{accarcciccek2020filtering,
  author    = {A{\c{c}}ar{\c{c}}i{\c{c}}ek, Haluk and
               {\c{C}}olako{\u{g}}lu, Talha and
               Hatipo{\u{g}}lu, P{\i}nar Ece Aktan and
               Huang, Chong Hsuan and
               Peng, Wei},
  title     = {Filtering noisy parallel corpus using transformers with proxy task learning},
  booktitle = {Proceedings of the Fifth Conference on Machine Translation},
  year      = {2020},
  pages     = {940--946},
}
% Chaudhary et al. (2019), cited as an arXiv preprint.
@article{chaudhary2019low,
  author  = {Chaudhary, Vishrav and
             Tang, Yuqing and
             Guzm{\'a}n, Francisco and
             Schwenk, Holger and
             Koehn, Philipp},
  title   = {Low-resource corpus filtering using multilingual sentence embeddings},
  journal = {arXiv preprint arXiv:1906.08885},
  year    = {2019},
}
% Conneau et al. (2017), cited as an arXiv preprint.
@article{conneau2017word,
  author  = {Conneau, Alexis and
             Lample, Guillaume and
             Ranzato, Marc'Aurelio and
             Denoyer, Ludovic and
             J{\'e}gou, Herv{\'e}},
  title   = {Word Translation Without Parallel Data},
  journal = {arXiv preprint arXiv:1710.04087},
  year    = {2017},
}
% MUSE tool page on ai.meta.com. The source names no author, so the field is
% omitted rather than left empty. The acronym is braced against title recasing.
@misc{metaMUSE,
  title        = {{MUSE}},
  howpublished = {\url{https://ai.meta.com/tools/muse/}},
  year         = {2023},
  note         = {[Accessed 16-12-2023]},
}
% Devlin et al. (2019), the BERT paper, cited through its arXiv PDF.
% There is no journal, so the entry is typed as misc. The month uses the
% standard three-letter macro, and the arXiv identifier from the URL is
% also recorded in the eprint fields. BERT is braced against title recasing.
@misc{BERT,
  author        = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title         = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  year          = {2019},
  month         = may,
  eprint        = {1810.04805},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/pdf/1810.04805.pdf},
}
% Post (2018), "A Call for Clarity in Reporting BLEU Scores"; record exported from dblp.
@article{sacreBleu,
  title      = {A Call for Clarity in Reporting {BLEU} Scores},
  author     = {Matt Post},
  year       = {2018},
  journal    = {CoRR},
  volume     = {abs/1804.08771},
  eprint     = {1804.08771},
  eprinttype = {arXiv},
  url        = {http://arxiv.org/abs/1804.08771},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1804-08771.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
  timestamp  = {Mon, 13 Aug 2018 16:46:34 +0200},
}
% Papineni et al. (2002), the original BLEU metric paper.
% The venue is written "Annual Meeting", matching how the same conference
% series is spelled elsewhere in this bibliography; booktitle is printed as entered.
@inproceedings{bleu,
  author    = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
  title     = {Bleu: a method for automatic evaluation of machine translation},
  booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics},
  pages     = {311--318},
  year      = {2002},
}
% Web page for bert-base-multilingual-cased on huggingface.co.
% BERT is braced so sentence-casing styles do not render it as "Bert".
@misc{bert-hugging-face,
  title   = {{BERT} multilingual base model (cased)},
  url     = {https://huggingface.co/bert-base-multilingual-cased},
  journal = {huggingface.co},
  note    = {[Accessed 10-12-2023]},
}