Huertas-Tato, Javier; Martín, Alejandro; Camacho, David
SILT: Efficient transformer training for inter-lingual inference Journal Article
In: Expert Systems with Applications, vol. 200, pp. 116923, 2022, ISSN: 0957-4174.
@article{huertas-tato_silt_2022,
title = {{SILT}: Efficient transformer training for inter-lingual inference},
author = {Javier Huertas-Tato and Alejandro Martín and David Camacho},
url = {https://www.sciencedirect.com/science/article/pii/S0957417422003578},
doi = {10.1016/j.eswa.2022.116923},
issn = {0957-4174},
year = {2022},
date = {2022-08-01},
urldate = {2022-08-01},
journal = {Expert Systems with Applications},
volume = {200},
pages = {116923},
abstract = {The ability of transformers to perform precision tasks such as question answering, Natural Language Inference (NLI) or summarizing, has enabled them to be ranked as one of the best paradigms to address Natural Language Processing (NLP) tasks. NLI is one of the best scenarios to test these architectures, due to the knowledge required to understand complex sentences and established relationships between a hypothesis and a premise. Nevertheless, these models suffer from the incapacity to generalize to other domains or from difficulties to face multilingual and interlingual scenarios. The leading pathway in the literature to address these issues involve designing and training extremely large architectures, but this causes unpredictable behaviors and establishes barriers which impede broad access and fine tuning. In this paper, we propose a new architecture called Siamese Inter-Lingual Transformer (SILT). This architecture is able to efficiently align multilingual embeddings for Natural Language Inference, allowing for unmatched language pairs to be processed. SILT leverages siamese pre-trained multi-lingual transformers with frozen weights where the two input sentences attend to each other to later be combined through a matrix alignment method. The experimental results carried out in this paper evidence that SILT allows to reduce drastically the number of trainable parameters while allowing for inter-lingual NLI and achieving state-of-the-art performance on common benchmarks.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Martín, Alejandro; Huertas-Tato, Javier; Huertas-García, Álvaro; Villar-Rodríguez, Guillermo; Camacho, David
FacTeR-Check: Semi-automated fact-checking through Semantic Similarity and Natural Language Inference Journal Article
In: arXiv:2110.14532 [cs], 2022, (arXiv: 2110.14532).
@article{martin_facter-check_2022,
title = {{FacTeR-Check}: Semi-automated fact-checking through Semantic Similarity and Natural Language Inference},
author = {Alejandro Martín and Javier Huertas-Tato and Álvaro Huertas-García and Guillermo Villar-Rodríguez and David Camacho},
url = {http://arxiv.org/abs/2110.14532},
eprint = {2110.14532},
archiveprefix = {arXiv},
primaryclass = {cs},
year = {2022},
date = {2022-02-01},
urldate = {2022-02-01},
journal = {{arXiv} preprint},
abstract = {Our society produces and shares overwhelming amounts of information through Online Social Networks (OSNs). Within this environment, misinformation and disinformation have proliferated, becoming a public safety concern in most countries. Allowing the public and professionals to efficiently find reliable evidences about the factual veracity of a claim is a crucial step to mitigate this harmful spread. To this end, we propose FacTeR-Check, a multilingual architecture for semi-automated fact-checking that can be used for either applications designed for the general public and by fact-checking organisations. FacTeR-Check enables retrieving fact-checked information, unchecked claims verification and tracking dangerous information over social media. This architectures involves several modules developed to evaluate semantic similarity, to calculate natural language inference and to retrieve information from Online Social Networks. The union of all these components builds a semi-automated fact-checking tool able of verifying new claims, to extract related evidence, and to track the evolution of a hoax on a OSN. While individual modules are validated on related benchmarks (mainly MSTS and SICK), the complete architecture is validated using a new dataset called NLI19-SP that is publicly released with COVID-19 related hoaxes and tweets from Spanish social media. Our results show state-of-the-art performance on the individual benchmarks, as well as producing a useful analysis of the evolution over time of 61 different hoaxes.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Villar-Rodríguez, Guillermo; Souto-Rico, Mónica; Martín, Alejandro
Virality, only the tip of the iceberg: ways of spread and interaction around COVID-19 misinformation in Twitter Journal Article
In: Communication & Society, pp. 239–256, 2022.
@article{villar2022virality,
title = {Virality, only the tip of the iceberg: ways of spread and interaction around {COVID-19} misinformation in {Twitter}},
author = {Guillermo Villar-Rodríguez and Mónica Souto-Rico and Alejandro Martín},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {Communication \& Society},
pages = {239--256},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Huertas-García, Álvaro; Huertas-Tato, Javier; Martín, Alejandro; Camacho, David
CIVIC-UPM at CheckThat! 2021: Integration of Transformers in Misinformation Detection and Topic Classification Proceedings Article
In: Conference and Labs of the Evaluation Forum (CLEF) Working Notes, pp. 520–530, 2021.
@inproceedings{huertas-garcia_civic-upm_2021,
title = {{CIVIC-UPM} at {CheckThat!} 2021: Integration of Transformers in Misinformation Detection and Topic Classification},
author = {Álvaro Huertas-García and Javier Huertas-Tato and Alejandro Martín and David Camacho},
url = {http://ceur-ws.org/Vol-2936/paper-41.pdf},
year = {2021},
date = {2021-05-24},
urldate = {2021-05-24},
booktitle = {Conference and Labs of the Evaluation Forum ({CLEF}) Working Notes},
pages = {520--530},
abstract = {Online Social Networks (OSNs) growth enables and amplifies the quick spread of harmful, manipulative and false information that influence public opinion while sow conflict on social or political issues. Therefore, the development of tools to detect malicious actors and to identify low-credibility information and misinformation sources is a new crucial challenge in the ever-evolving field of Artificial Intelligence. The scope of this paper is to present a Natural Language Processing (NLP) approach that uses Doc2Vec and different state-of-the-art transformer-based models for the CLEF2021 Checkthat! lab Task 3. Through this approach, the results show that it is possible to achieve 41.43\% macro-average F1-score in the misinformation detection (Task A) and 67.65\% macro-average F1-score in the topic classification (Task B).},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Huertas-García, Álvaro
Automatic information search for countering covid-19 misinformation through semantic similarity Masters Thesis
Universidad Autónoma de Madrid, 2021.
@mastersthesis{huertas-garcia_uam_2021,
title = {Automatic information search for countering {COVID-19} misinformation through semantic similarity},
author = {Álvaro Huertas-García},
url = {https://repositorio.uam.es/handle/10486/695067},
year = {2021},
date = {2021-02-26},
urldate = {2021-02-26},
school = {Universidad Autónoma de Madrid},
abstract = {Information quality in social media is an increasingly important issue and misinformation problem has become even more critical in the current COVID-19 pandemic, leading people exposed to false and potentially harmful claims and rumours. Civil society organizations, such as the World Health Organization, have demanded a global call for action to promote access to health information and mitigate harm from health misinformation. Consequently, this project pursues countering the spread of COVID-19 infodemic and its potential health hazards. In this work, we give an overall view of models and methods that have been employed in the NLP field from its foundations to the latest state-of-the-art approaches. Focusing on deep learning methods, we propose applying multilingual Transformer models based on siamese networks, also called bi-encoders, combined with ensemble and PCA dimensionality reduction techniques. The goal is to counter COVID-19 misinformation by analyzing the semantic similarity between a claim and tweets from a collection gathered from official fact-checkers verified by the International Fact-Checking Network of the Poynter Institute. It is factual that the number of Internet users increases every year and the language spoken determines access to information online. For this reason, we give a special effort in the application of multilingual models to tackle misinformation across the globe. Regarding semantic similarity, we firstly evaluate these multilingual ensemble models and improve the result in the STS-Benchmark compared to monolingual and single models. Secondly, we enhance the interpretability of the models’ performance through the SentEval toolkit. Lastly, we compare these models’ performance against biomedical models in TREC-COVID task round 1 using the BM25 Okapi ranking method as the baseline. Moreover, we are interested in understanding the ins and outs of misinformation. For that purpose, we extend interpretability using machine learning and deep learning approaches for sentiment analysis and topic modelling. Finally, we developed a dashboard to ease visualization of the results. In our view, the results obtained in this project constitute an excellent initial step toward incorporating multilingualism and will assist researchers and people in countering COVID-19 misinformation.},
keywords = {},
pubstate = {published},
tppubtype = {mastersthesis}
}
Huertas-García, Álvaro; Huertas-Tato, Javier; Martín, Alejandro; Camacho, David
Countering Misinformation Through Semantic-Aware Multilingual Models Proceedings Article
In: Yin, Hujun; Camacho, David; Tino, Peter; Allmendinger, Richard; Tallón-Ballesteros, Antonio J.; Tang, Ke; Cho, Sung-Bae; Novais, Paulo; Nascimento, Susana (Ed.): Intelligent Data Engineering and Automated Learning – IDEAL 2021, pp. 312–323, Springer International Publishing, Cham, 2021, ISBN: 978-3-030-91608-4.
@inproceedings{huertas-garcia_countering_2021,
title = {Countering Misinformation Through Semantic-Aware Multilingual Models},
author = {Álvaro Huertas-García and Javier Huertas-Tato and Alejandro Martín and David Camacho},
editor = {Hujun Yin and David Camacho and Peter Tino and Richard Allmendinger and Antonio J. Tallón-Ballesteros and Ke Tang and Sung-Bae Cho and Paulo Novais and Susana Nascimento},
doi = {10.1007/978-3-030-91608-4_31},
isbn = {978-3-030-91608-4},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {Intelligent Data Engineering and Automated Learning – {IDEAL} 2021},
pages = {312--323},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {The presence of misinformation and harmful content on social networks is an emerging problem that endangers public health. One of the most successful approaches for detecting, assessing, and providing prompt responses to this misinformation problem is Natural Language Processing (NLP) techniques based on semantic similarity. However, language constitutes one of the most significant barriers to address, denoting the need to develop multilingual tools for an effective fight against misinformation. This paper presents an approach for countering misinformation through a semantic-aware multilingual architecture. Due to the specificity of the task addressed, which involves assessing the level of similarity between a pair of texts in a multilingual scenario, we built an extension of the well-known Semantic Textual Similarity Benchmark (STSb) to 15 languages. This new dataset allows to fine-tune and evaluate multilingual models based on Transformers with a siamese network topology on monolingual and cross-lingual Semantic Textual Similarity (STS) tasks, achieving a maximum average Spearman correlation coefficient of 83.60\%. We validate our proposal using the Covid-19 MLIA @ Eval Multilingual Semantic Search Task. The results reported demonstrate that semantic-aware multilingual architectures are successful at measuring the degree of similarity between pairs of texts, while broadening our understanding of the multilingual capabilities of this type of models. The results and the new multilingual STS Benchmark data presented and made publicly in this study constitute an initial step towards extending methods proposed in the literature that employ semantic similarity to combat misinformation at a multilingual level.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Villar-Rodríguez, Guillermo; Huertas-Tato, Javier; Martín, Alejandro; Camacho, David
A la desinformación le gusta la compañía: Representación de bulos de Twitter sobre la COVID-19 mediante embeddings Conference
XIX Conference of the Spanish Association for Artificial Intelligence, 2021.
@conference{villar2021disinfo,
title = {A la desinformación le gusta la compañía: Representación de bulos de {Twitter} sobre la {COVID-19} mediante embeddings},
author = {Guillermo Villar-Rodríguez and Javier Huertas-Tato and Alejandro Martín and David Camacho},
isbn = {978-84-09-30514-8},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {XIX Conference of the Spanish Association for Artificial Intelligence},
pages = {523--528},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}