MinHash sketching is an important algorithm for efficient document retrieval and bioinformatics. We show that the value of the matching MinHash codes convey additional information about the Jaccard similarity of S and T over and above the fact that the MinHash codes agree. This observation holds the potential to increase the sensitivity of minhash-based retrieval systems. We analyze the expected Jaccard similarity of two sets as a function of observing a matching MinHash value a under a reasonable prior distribution on intersection set sizes, and present a practical approach to using MinHash values to improve the sensitivity of traditional Jaccard similarity estimation, based on the Kolmogorov-Smirnov statistical test for sample distributions. Experiments over a wide range of hash function counts and set similarities show a small but consistent improvement over chance at predicting over/under-estimation, yielding an average accuracy of 61% over the range of experiments.
@InProceedings{kucherov_et_al:LIPIcs.CPM.2023.20, author = {Kucherov, Gregory and Skiena, Steven}, title = {{Improving the Sensitivity of MinHash Through Hash-Value Analysis}}, booktitle = {34th Annual Symposium on Combinatorial Pattern Matching (CPM 2023)}, pages = {20:1--20:12}, series = {Leibniz International Proceedings in Informatics (LIPIcs)}, ISBN = {978-3-95977-276-1}, ISSN = {1868-8969}, year = {2023}, volume = {259}, editor = {Bulteau, Laurent and Lipt\'{a}k, Zsuzsanna}, publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik}, address = {Dagstuhl, Germany}, URL = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.CPM.2023.20}, URN = {urn:nbn:de:0030-drops-179740}, doi = {10.4230/LIPIcs.CPM.2023.20}, annote = {Keywords: MinHash sketching, sequence similarity, hashing} }
Feedback for Dagstuhl Publishing