Python has a growing community of users, especially in the AI and ML fields. Yet, Computational Processing of Portuguese in this programming language is limited, in both available tools and results. This paper describes NLPyPort, an NLP pipeline in Python, primarily based on NLTK, and focused on Portuguese. It is mostly assembled from pre-existing resources or their adaptations, but improves over the performance of existing alternatives in Python, namely in the tasks of tokenization, PoS tagging, lemmatization, and NER.
@InProceedings{ferreira_et_al:OASIcs.SLATE.2019.18, author = {Ferreira, Jo\~{a}o and Gon\c{c}alo Oliveira, Hugo and Rodrigues, Ricardo}, title = {{Improving NLTK for Processing Portuguese}}, booktitle = {8th Symposium on Languages, Applications and Technologies (SLATE 2019)}, pages = {18:1--18:9}, series = {Open Access Series in Informatics (OASIcs)}, ISBN = {978-3-95977-114-6}, ISSN = {2190-6807}, year = {2019}, volume = {74}, editor = {Rodrigues, Ricardo and Janou\v{s}ek, Jan and Ferreira, Lu{\'\i}s and Coheur, Lu{\'\i}sa and Batista, Fernando and Gon\c{c}alo Oliveira, Hugo}, publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik}, address = {Dagstuhl, Germany}, URL = {https://drops.dagstuhl.de/entities/document/10.4230/OASIcs.SLATE.2019.18}, URN = {urn:nbn:de:0030-drops-108852}, doi = {10.4230/OASIcs.SLATE.2019.18}, annote = {Keywords: NLP, Tokenization, PoS tagging, Lemmatization, Named Entity Recognition} }
Feedback for Dagstuhl Publishing