We introduce AnnoHub, an on-going effort to automatically complement existing language resources with metadata about the languages they cover and the annotation schemes (tagsets) that they apply, to provide a web interface for their curation and evaluation by means of domain experts, and to publish them as a RDF dataset and as part of the (Linguistic) Linked Open Data (LLOD) cloud. In this paper, we focus on tabular formats with tab-separated values (TSV), a de-facto standard for annotated corpora as popularized as part of the CoNLL Shared Tasks. By extension, other formats for which a converter to CoNLL and/or TSV formats does exist, can be processed analoguously. We describe our implementation and its evaluation against a sample of 93 corpora from the Universal Dependencies, v.2.3.
% Conference paper in the LDK 2019 proceedings (OASIcs vol. 70, article 23, pp. 1--9).
% Only the acronym in the title is brace-protected so styles may still recase the rest;
% accented letters use the {\"x} form so classic BibTeX treats each as a single letter.
@inproceedings{abromeit_et_al:OASIcs.LDK.2019.23,
  author    = {Abromeit, Frank and Chiarcos, Christian},
  title     = {Automatic Detection of Language and Annotation Model Information in {CoNLL} Corpora},
  booktitle = {2nd Conference on Language, Data and Knowledge ({LDK} 2019)},
  editor    = {Eskevich, Maria and de Melo, Gerard and F{\"a}th, Christian and McCrae, John P. and Buitelaar, Paul and Chiarcos, Christian and Klimek, Bettina and Dojchinovski, Milan},
  series    = {Open Access Series in Informatics ({OASIcs})},
  volume    = {70},
  pages     = {23:1--23:9},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  year      = {2019},
  isbn      = {978-3-95977-105-4},
  issn      = {2190-6807},
  doi       = {10.4230/OASIcs.LDK.2019.23},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/OASIcs.LDK.2019.23},
  urn       = {urn:nbn:de:0030-drops-103873},
  annote    = {Keywords: LLOD, CoNLL, OLiA},
}
Feedback for Dagstuhl Publishing