A bitext produced from a Portuguese historical text and its English translation, Fernão Mendes Pinto's Pilgrimage, serves as a case study to describe the creation of a parallel corpus and to investigate which linguistic and textual units are the best indicators of alignability. The process of building the corpus comprises the preparation of transcriptions, annotation, segmentation and sentence alignment. Once the bitext is ready, the corpus is used to investigate which units are most relevant for predicting that the two texts are parallel. Of the units considered, ranging from the largest content units (chapters) through sentences, word types and tokens down to characters, the last of these, despite being the unit with the least textual and linguistic significance, were found to be the best indicator of the two texts being alignable.
@comment{Conference paper from SLATE 2018, published in the OASIcs series, volume 62.}
@inproceedings{canosa:OASIcs.SLATE.2018.16,
  author    = {Canosa, Afonso Xavier},
  title     = {Comparison of Segmentable Units as Indicators of Two Texts Being Parallel},
  booktitle = {7th Symposium on Languages, Applications and Technologies ({SLATE} 2018)},
  editor    = {Henriques, Pedro Rangel and Leal, Jos{\'e} Paulo and Leit{\~a}o, Ant{\'o}nio Menezes and Guinovart, Xavier G{\'o}mez},
  series    = {Open Access Series in Informatics ({OASIcs})},
  volume    = {62},
  pages     = {16:1--16:7},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  year      = {2018},
  isbn      = {978-3-95977-072-9},
  issn      = {2190-6807},
  doi       = {10.4230/OASIcs.SLATE.2018.16},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/OASIcs.SLATE.2018.16},
  urn       = {urn:nbn:de:0030-drops-92747},
  annote    = {Keywords: parallel corpora, text alignment, bitexts},
}
Feedback for Dagstuhl Publishing