Skip to content

Commit 90704ee

Browse files
committed
added Prefix/Postfix similarity
1 parent 87ee0dd commit 90704ee

File tree

8 files changed

+519
-2
lines changed

8 files changed

+519
-2
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
## Changelog
22

3+
4+
### [1.9.0] - 2022-10-22
5+
#### Added
6+
- added `Prefix`/`Postfix` similarity
7+
38
### [1.8.0] - 2022-10-02
49
#### Fixed
510
- fixed incorrect score_cutoff handling in `lcs_seq_distance`

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
2020
message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt")
2121
endif()
2222

23-
project(rapidfuzz LANGUAGES CXX VERSION 1.8.0)
23+
project(rapidfuzz LANGUAGES CXX VERSION 1.9.0)
2424

2525
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
2626
include(GNUInstallDirs)

extras/rapidfuzz_amalgamated.hpp

Lines changed: 247 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
22
// SPDX-License-Identifier: MIT
33
// RapidFuzz v1.0.2
4-
// Generated: 2022-10-02 12:26:03.961379
4+
// Generated: 2022-10-22 15:47:34.253072
55
// ----------------------------------------------------------
66
// This file is an amalgamation of multiple different files.
77
// You probably shouldn't edit it directly.
@@ -7674,6 +7674,252 @@ CachedOSA(InputIt1 first1, InputIt1 last1) -> CachedOSA<iter_value_t<InputIt1>>;
76747674

76757675
} // namespace rapidfuzz
76767676

7677+
#include <cmath>
7678+
#include <numeric>
7679+
7680+
#include "rapidfuzz/details/common.hpp"
7681+
7682+
namespace rapidfuzz::detail {
7683+
7684+
// Prefix similarity metric.
// The class passes itself as the first template argument of SimilarityBase; the remaining
// arguments are int64_t, 0 and the largest int64_t value.
// NOTE(review): SimilarityBase is not visible in this chunk - presumably int64_t is the score
// type and 0 / max() are the score bounds; confirm against its definition.
class Prefix : public SimilarityBase<Prefix, int64_t, 0, std::numeric_limits<int64_t>::max()> {
7685+
// Members of a `class` are private by default; these friend declarations give the two base
// templates access to maximum() and _similarity() below.
friend SimilarityBase<Prefix, int64_t, 0, std::numeric_limits<int64_t>::max()>;
7686+
friend NormalizedMetricBase<Prefix>;
7687+
7688+
// Returns the larger of the two range sizes.
// NOTE(review): presumably the upper bound of the similarity, used by the bases for
// distance/normalization - confirm in SimilarityBase / NormalizedMetricBase.
template <typename InputIt1, typename InputIt2>
7689+
static int64_t maximum(Range<InputIt1> s1, Range<InputIt2> s2)
7690+
{
7691+
return std::max(s1.size(), s2.size());
7692+
}
7693+
7694+
// Returns the value of remove_common_prefix(s1, s2) (cast to int64_t) when it is at least
// score_cutoff, otherwise 0.
// s1 and s2 are taken by value, so the helper operates on this function's own Range copies.
// NOTE(review): remove_common_prefix is not visible here - presumably it returns the length
// of the shared prefix; confirm.
template <typename InputIt1, typename InputIt2>
7695+
static int64_t _similarity(Range<InputIt1> s1, Range<InputIt2> s2, int64_t score_cutoff)
7696+
{
7697+
// NOTE(review): despite its name, `dist` is the value returned as the similarity.
int64_t dist = static_cast<int64_t>(remove_common_prefix(s1, s2));
7698+
return (dist >= score_cutoff) ? dist : 0;
7699+
}
7700+
};
7701+
7702+
} // namespace rapidfuzz::detail
7703+
7704+
namespace rapidfuzz {
7705+
7706+
// Prefix distance for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Prefix::distance and returns its
// int64_t result unchanged. score_cutoff defaults to the largest int64_t value.
// NOTE(review): distance() is not defined in the visible Prefix class, so it presumably comes
// from its SimilarityBase base; confirm there how score_cutoff is applied.
template <typename InputIt1, typename InputIt2>
7707+
int64_t prefix_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
7708+
int64_t score_cutoff = std::numeric_limits<int64_t>::max())
7709+
{
7710+
return detail::Prefix::distance(first1, last1, first2, last2, score_cutoff);
7711+
}
7712+
7713+
// Prefix distance for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Prefix::distance and returns its
// int64_t result unchanged. score_cutoff defaults to the largest int64_t value.
// NOTE(review): distance() is not defined in the visible Prefix class, so it presumably comes
// from its SimilarityBase base; confirm there how score_cutoff is applied.
template <typename Sentence1, typename Sentence2>
7714+
int64_t prefix_distance(const Sentence1& s1, const Sentence2& s2,
7715+
int64_t score_cutoff = std::numeric_limits<int64_t>::max())
7716+
{
7717+
return detail::Prefix::distance(s1, s2, score_cutoff);
7718+
}
7719+
7720+
// Prefix similarity for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Prefix::similarity and returns its
// int64_t result unchanged. score_cutoff defaults to 0.
// NOTE(review): similarity() is presumably supplied by the SimilarityBase base and ends up in
// Prefix::_similarity, which returns 0 for results below score_cutoff; confirm in the base.
template <typename InputIt1, typename InputIt2>
7721+
int64_t prefix_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
7722+
int64_t score_cutoff = 0)
7723+
{
7724+
return detail::Prefix::similarity(first1, last1, first2, last2, score_cutoff);
7725+
}
7726+
7727+
// Prefix similarity for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Prefix::similarity and returns its
// int64_t result unchanged. score_cutoff defaults to 0.
// NOTE(review): similarity() is presumably supplied by the SimilarityBase base and ends up in
// Prefix::_similarity, which returns 0 for results below score_cutoff; confirm in the base.
template <typename Sentence1, typename Sentence2>
7728+
int64_t prefix_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0)
7729+
{
7730+
return detail::Prefix::similarity(s1, s2, score_cutoff);
7731+
}
7732+
7733+
// Normalized prefix distance for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Prefix::normalized_distance and
// returns its double result unchanged. score_cutoff defaults to 1.0.
// NOTE(review): the normalization itself lives in a base class that is not visible here
// (presumably NormalizedMetricBase, scaling by Prefix::maximum); confirm the result range there.
template <typename InputIt1, typename InputIt2>
7734+
double prefix_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
7735+
double score_cutoff = 1.0)
7736+
{
7737+
return detail::Prefix::normalized_distance(first1, last1, first2, last2, score_cutoff);
7738+
}
7739+
7740+
// Normalized prefix distance for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Prefix::normalized_distance and
// returns its double result unchanged. score_cutoff defaults to 1.0.
// NOTE(review): the normalization itself lives in a base class that is not visible here
// (presumably NormalizedMetricBase, scaling by Prefix::maximum); confirm the result range there.
template <typename Sentence1, typename Sentence2>
7741+
double prefix_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0)
7742+
{
7743+
return detail::Prefix::normalized_distance(s1, s2, score_cutoff);
7744+
}
7745+
7746+
// Normalized prefix similarity for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Prefix::normalized_similarity and
// returns its double result unchanged. score_cutoff defaults to 0.0.
// NOTE(review): the normalization itself lives in a base class that is not visible here
// (presumably NormalizedMetricBase, scaling by Prefix::maximum); confirm the result range there.
template <typename InputIt1, typename InputIt2>
7747+
double prefix_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
7748+
double score_cutoff = 0.0)
7749+
{
7750+
return detail::Prefix::normalized_similarity(first1, last1, first2, last2, score_cutoff);
7751+
}
7752+
7753+
// Normalized prefix similarity for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Prefix::normalized_similarity and
// returns its double result unchanged. score_cutoff defaults to 0.0.
// NOTE(review): the normalization itself lives in a base class that is not visible here
// (presumably NormalizedMetricBase, scaling by Prefix::maximum); confirm the result range there.
template <typename Sentence1, typename Sentence2>
7754+
double prefix_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
7755+
{
7756+
return detail::Prefix::normalized_similarity(s1, s2, score_cutoff);
7757+
}
7758+
7759+
// Prefix scorer that keeps its own copy of the first sequence (member s1) so it can be
// compared against many second sequences.
// The struct passes itself as the first template argument of detail::CachedSimilarityBase;
// the remaining arguments are int64_t, 0 and the largest int64_t value.
// NOTE(review): CachedSimilarityBase is not visible in this chunk - presumably it provides the
// public similarity()/distance() entry points that call maximum()/_similarity() below; confirm.
template <typename CharT1>
7760+
struct CachedPrefix : public detail::CachedSimilarityBase<CachedPrefix<CharT1>, int64_t, 0,
7761+
std::numeric_limits<int64_t>::max()> {
7762+
// Builds the scorer from a whole sequence by delegating to the iterator-pair constructor
// with detail::to_begin(s1_) / detail::to_end(s1_).
template <typename Sentence1>
7763+
explicit CachedPrefix(const Sentence1& s1_) : CachedPrefix(detail::to_begin(s1_), detail::to_end(s1_))
7764+
{}
7765+
7766+
// Builds the scorer from an iterator pair; the elements are copied into the s1 string.
template <typename InputIt1>
7767+
CachedPrefix(InputIt1 first1, InputIt1 last1) : s1(first1, last1)
7768+
{}
7769+
7770+
private:
7771+
// The hooks below are private; these friend declarations give the base templates access.
friend detail::CachedSimilarityBase<CachedPrefix<CharT1>, int64_t, 0,
7772+
std::numeric_limits<int64_t>::max()>;
7773+
friend detail::CachedNormalizedMetricBase<CachedPrefix<CharT1>>;
7774+
7775+
// Returns the larger of the stored sequence's length and s2.size().
// NOTE(review): std::max needs both arguments to have the same type, so this only compiles
// when detail::Range::size() returns exactly int64_t; confirm on platforms where ptrdiff_t
// and int64_t are distinct types.
template <typename InputIt2>
7776+
int64_t maximum(detail::Range<InputIt2> s2) const
7777+
{
7778+
return std::max(static_cast<int64_t>(s1.size()), s2.size());
7779+
}
7780+
7781+
// Forwards the stored sequence, s2 and score_cutoff to detail::Prefix::similarity and
// returns its result. Nothing is precomputed from s1 beyond storing the copy.
template <typename InputIt2>
7782+
int64_t _similarity(detail::Range<InputIt2> s2, int64_t score_cutoff) const
7783+
{
7784+
return detail::Prefix::similarity(s1, s2, score_cutoff);
7785+
}
7786+
7787+
// Owned copy of the first sequence.
std::basic_string<CharT1> s1;
7788+
};
7789+
7790+
// Deduction guide: constructing a CachedPrefix from a single sequence argument deduces
// CharT1 as char_type<Sentence1>.
template <typename Sentence1>
7791+
explicit CachedPrefix(const Sentence1& s1_) -> CachedPrefix<char_type<Sentence1>>;
7792+
7793+
// Deduction guide: constructing a CachedPrefix from an iterator pair deduces CharT1 as
// iter_value_t<InputIt1>.
template <typename InputIt1>
7794+
CachedPrefix(InputIt1 first1, InputIt1 last1) -> CachedPrefix<iter_value_t<InputIt1>>;
7795+
7796+
/**@}*/
7797+
7798+
} // namespace rapidfuzz
7799+
7800+
#include <cmath>
7801+
#include <numeric>
7802+
7803+
#include "rapidfuzz/details/common.hpp"
7804+
7805+
namespace rapidfuzz::detail {
7806+
7807+
// Postfix (common suffix) similarity metric.
// The class passes itself as the first template argument of SimilarityBase; the remaining
// arguments are int64_t, 0 and the largest int64_t value.
// NOTE(review): SimilarityBase is not visible in this chunk - presumably int64_t is the score
// type and 0 / max() are the score bounds; confirm against its definition.
class Postfix : public SimilarityBase<Postfix, int64_t, 0, std::numeric_limits<int64_t>::max()> {
7808+
// Members of a `class` are private by default; these friend declarations give the two base
// templates access to maximum() and _similarity() below.
friend SimilarityBase<Postfix, int64_t, 0, std::numeric_limits<int64_t>::max()>;
7809+
friend NormalizedMetricBase<Postfix>;
7810+
7811+
// Returns the larger of the two range sizes.
// NOTE(review): presumably the upper bound of the similarity, used by the bases for
// distance/normalization - confirm in SimilarityBase / NormalizedMetricBase.
template <typename InputIt1, typename InputIt2>
7812+
static int64_t maximum(Range<InputIt1> s1, Range<InputIt2> s2)
7813+
{
7814+
return std::max(s1.size(), s2.size());
7815+
}
7816+
7817+
// Returns the value of remove_common_suffix(s1, s2) (cast to int64_t) when it is at least
// score_cutoff, otherwise 0.
// s1 and s2 are taken by value, so the helper operates on this function's own Range copies.
// NOTE(review): remove_common_suffix is not visible here - presumably it returns the length
// of the shared suffix; confirm.
template <typename InputIt1, typename InputIt2>
7818+
static int64_t _similarity(Range<InputIt1> s1, Range<InputIt2> s2, int64_t score_cutoff)
7819+
{
7820+
// NOTE(review): despite its name, `dist` is the value returned as the similarity.
int64_t dist = static_cast<int64_t>(remove_common_suffix(s1, s2));
7821+
return (dist >= score_cutoff) ? dist : 0;
7822+
}
7823+
};
7824+
7825+
} // namespace rapidfuzz::detail
7826+
7827+
namespace rapidfuzz {
7828+
7829+
// Postfix distance for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Postfix::distance and returns its
// int64_t result unchanged. score_cutoff defaults to the largest int64_t value.
// NOTE(review): distance() is not defined in the visible Postfix class, so it presumably comes
// from its SimilarityBase base; confirm there how score_cutoff is applied.
template <typename InputIt1, typename InputIt2>
7830+
int64_t postfix_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
7831+
int64_t score_cutoff = std::numeric_limits<int64_t>::max())
7832+
{
7833+
return detail::Postfix::distance(first1, last1, first2, last2, score_cutoff);
7834+
}
7835+
7836+
// Postfix distance for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Postfix::distance and returns its
// int64_t result unchanged. score_cutoff defaults to the largest int64_t value.
// NOTE(review): distance() is not defined in the visible Postfix class, so it presumably comes
// from its SimilarityBase base; confirm there how score_cutoff is applied.
template <typename Sentence1, typename Sentence2>
7837+
int64_t postfix_distance(const Sentence1& s1, const Sentence2& s2,
7838+
int64_t score_cutoff = std::numeric_limits<int64_t>::max())
7839+
{
7840+
return detail::Postfix::distance(s1, s2, score_cutoff);
7841+
}
7842+
7843+
// Postfix similarity for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Postfix::similarity and returns its
// int64_t result unchanged. score_cutoff defaults to 0.
// NOTE(review): similarity() is presumably supplied by the SimilarityBase base and ends up in
// Postfix::_similarity, which returns 0 for results below score_cutoff; confirm in the base.
template <typename InputIt1, typename InputIt2>
7844+
int64_t postfix_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
7845+
int64_t score_cutoff = 0)
7846+
{
7847+
return detail::Postfix::similarity(first1, last1, first2, last2, score_cutoff);
7848+
}
7849+
7850+
// Postfix similarity for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Postfix::similarity and returns its
// int64_t result unchanged. score_cutoff defaults to 0.
// NOTE(review): similarity() is presumably supplied by the SimilarityBase base and ends up in
// Postfix::_similarity, which returns 0 for results below score_cutoff; confirm in the base.
template <typename Sentence1, typename Sentence2>
7851+
int64_t postfix_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0)
7852+
{
7853+
return detail::Postfix::similarity(s1, s2, score_cutoff);
7854+
}
7855+
7856+
// Normalized postfix distance for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Postfix::normalized_distance and
// returns its double result unchanged. score_cutoff defaults to 1.0.
// NOTE(review): the normalization itself lives in a base class that is not visible here
// (presumably NormalizedMetricBase, scaling by Postfix::maximum); confirm the result range there.
template <typename InputIt1, typename InputIt2>
7857+
double postfix_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
7858+
double score_cutoff = 1.0)
7859+
{
7860+
return detail::Postfix::normalized_distance(first1, last1, first2, last2, score_cutoff);
7861+
}
7862+
7863+
// Normalized postfix distance for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Postfix::normalized_distance and
// returns its double result unchanged. score_cutoff defaults to 1.0.
// NOTE(review): the normalization itself lives in a base class that is not visible here
// (presumably NormalizedMetricBase, scaling by Postfix::maximum); confirm the result range there.
template <typename Sentence1, typename Sentence2>
7864+
double postfix_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0)
7865+
{
7866+
return detail::Postfix::normalized_distance(s1, s2, score_cutoff);
7867+
}
7868+
7869+
// Normalized postfix similarity for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Postfix::normalized_similarity and
// returns its double result unchanged. score_cutoff defaults to 0.0.
// NOTE(review): the normalization itself lives in a base class that is not visible here
// (presumably NormalizedMetricBase, scaling by Postfix::maximum); confirm the result range there.
template <typename InputIt1, typename InputIt2>
7870+
double postfix_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
7871+
double score_cutoff = 0.0)
7872+
{
7873+
return detail::Postfix::normalized_similarity(first1, last1, first2, last2, score_cutoff);
7874+
}
7875+
7876+
// Normalized postfix similarity for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Postfix::normalized_similarity and
// returns its double result unchanged. score_cutoff defaults to 0.0.
// NOTE(review): the normalization itself lives in a base class that is not visible here
// (presumably NormalizedMetricBase, scaling by Postfix::maximum); confirm the result range there.
template <typename Sentence1, typename Sentence2>
7877+
double postfix_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
7878+
{
7879+
return detail::Postfix::normalized_similarity(s1, s2, score_cutoff);
7880+
}
7881+
7882+
// Postfix scorer that keeps its own copy of the first sequence (member s1) so it can be
// compared against many second sequences.
// The struct passes itself as the first template argument of detail::CachedSimilarityBase;
// the remaining arguments are int64_t, 0 and the largest int64_t value.
// NOTE(review): CachedSimilarityBase is not visible in this chunk - presumably it provides the
// public similarity()/distance() entry points that call maximum()/_similarity() below; confirm.
template <typename CharT1>
7883+
struct CachedPostfix : public detail::CachedSimilarityBase<CachedPostfix<CharT1>, int64_t, 0,
7884+
std::numeric_limits<int64_t>::max()> {
7885+
// Builds the scorer from a whole sequence by delegating to the iterator-pair constructor
// with detail::to_begin(s1_) / detail::to_end(s1_).
template <typename Sentence1>
7886+
explicit CachedPostfix(const Sentence1& s1_) : CachedPostfix(detail::to_begin(s1_), detail::to_end(s1_))
7887+
{}
7888+
7889+
// Builds the scorer from an iterator pair; the elements are copied into the s1 string.
template <typename InputIt1>
7890+
CachedPostfix(InputIt1 first1, InputIt1 last1) : s1(first1, last1)
7891+
{}
7892+
7893+
private:
7894+
// The hooks below are private; these friend declarations give the base templates access.
friend detail::CachedSimilarityBase<CachedPostfix<CharT1>, int64_t, 0,
7895+
std::numeric_limits<int64_t>::max()>;
7896+
friend detail::CachedNormalizedMetricBase<CachedPostfix<CharT1>>;
7897+
7898+
// Returns the larger of the stored sequence's length and s2.size().
// NOTE(review): std::max needs both arguments to have the same type, so this only compiles
// when detail::Range::size() returns exactly int64_t; confirm on platforms where ptrdiff_t
// and int64_t are distinct types.
template <typename InputIt2>
7899+
int64_t maximum(detail::Range<InputIt2> s2) const
7900+
{
7901+
return std::max(static_cast<int64_t>(s1.size()), s2.size());
7902+
}
7903+
7904+
// Forwards the stored sequence, s2 and score_cutoff to detail::Postfix::similarity and
// returns its result. Nothing is precomputed from s1 beyond storing the copy.
template <typename InputIt2>
7905+
int64_t _similarity(detail::Range<InputIt2> s2, int64_t score_cutoff) const
7906+
{
7907+
return detail::Postfix::similarity(s1, s2, score_cutoff);
7908+
}
7909+
7910+
// Owned copy of the first sequence.
std::basic_string<CharT1> s1;
7911+
};
7912+
7913+
// Deduction guide: constructing a CachedPostfix from a single sequence argument deduces
// CharT1 as char_type<Sentence1>.
template <typename Sentence1>
7914+
explicit CachedPostfix(const Sentence1& s1_) -> CachedPostfix<char_type<Sentence1>>;
7915+
7916+
// Deduction guide: constructing a CachedPostfix from an iterator pair deduces CharT1 as
// iter_value_t<InputIt1>.
template <typename InputIt1>
7917+
CachedPostfix(InputIt1 first1, InputIt1 last1) -> CachedPostfix<iter_value_t<InputIt1>>;
7918+
7919+
/**@}*/
7920+
7921+
} // namespace rapidfuzz
7922+
76777923
namespace rapidfuzz {
76787924

76797925
template <typename CharT, typename InputIt1, typename InputIt2>

rapidfuzz/distance.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
#include <rapidfuzz/distance/LCSseq.hpp>
1111
#include <rapidfuzz/distance/Levenshtein.hpp>
1212
#include <rapidfuzz/distance/OSA.hpp>
13+
#include <rapidfuzz/distance/Prefix.hpp>
14+
#include <rapidfuzz/distance/Postfix.hpp>
1315

1416
namespace rapidfuzz {
1517

rapidfuzz/distance/Postfix.hpp

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
/* SPDX-License-Identifier: MIT */
2+
/* Copyright © 2021 Max Bachmann */
3+
4+
#pragma once
5+
#include <cmath>
6+
#include <numeric>
7+
#include <rapidfuzz/details/common.hpp>
8+
#include <rapidfuzz/distance/Postfix_impl.hpp>
9+
10+
namespace rapidfuzz {
11+
12+
// Postfix distance for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Postfix::distance and returns its
// int64_t result unchanged. score_cutoff defaults to the largest int64_t value.
// NOTE(review): detail::Postfix is not declared in this header (presumably it comes from the
// included Postfix_impl.hpp); confirm there how score_cutoff is applied.
template <typename InputIt1, typename InputIt2>
13+
int64_t postfix_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
14+
int64_t score_cutoff = std::numeric_limits<int64_t>::max())
15+
{
16+
return detail::Postfix::distance(first1, last1, first2, last2, score_cutoff);
17+
}
18+
19+
// Postfix distance for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Postfix::distance and returns its
// int64_t result unchanged. score_cutoff defaults to the largest int64_t value.
// NOTE(review): detail::Postfix is not declared in this header (presumably it comes from the
// included Postfix_impl.hpp); confirm there how score_cutoff is applied.
template <typename Sentence1, typename Sentence2>
20+
int64_t postfix_distance(const Sentence1& s1, const Sentence2& s2,
21+
int64_t score_cutoff = std::numeric_limits<int64_t>::max())
22+
{
23+
return detail::Postfix::distance(s1, s2, score_cutoff);
24+
}
25+
26+
// Postfix similarity for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Postfix::similarity and returns its
// int64_t result unchanged. score_cutoff defaults to 0.
// NOTE(review): detail::Postfix is not declared in this header (presumably it comes from the
// included Postfix_impl.hpp); confirm there how score_cutoff is applied.
template <typename InputIt1, typename InputIt2>
27+
int64_t postfix_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
28+
int64_t score_cutoff = 0)
29+
{
30+
return detail::Postfix::similarity(first1, last1, first2, last2, score_cutoff);
31+
}
32+
33+
// Postfix similarity for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Postfix::similarity and returns its
// int64_t result unchanged. score_cutoff defaults to 0.
// NOTE(review): detail::Postfix is not declared in this header (presumably it comes from the
// included Postfix_impl.hpp); confirm there how score_cutoff is applied.
template <typename Sentence1, typename Sentence2>
34+
int64_t postfix_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0)
35+
{
36+
return detail::Postfix::similarity(s1, s2, score_cutoff);
37+
}
38+
39+
// Normalized postfix distance for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Postfix::normalized_distance and
// returns its double result unchanged. score_cutoff defaults to 1.0.
// NOTE(review): detail::Postfix and its normalization logic are not declared in this header
// (presumably in the included Postfix_impl.hpp and its bases); confirm the result range there.
template <typename InputIt1, typename InputIt2>
40+
double postfix_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
41+
double score_cutoff = 1.0)
42+
{
43+
return detail::Postfix::normalized_distance(first1, last1, first2, last2, score_cutoff);
44+
}
45+
46+
// Normalized postfix distance for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Postfix::normalized_distance and
// returns its double result unchanged. score_cutoff defaults to 1.0.
// NOTE(review): detail::Postfix and its normalization logic are not declared in this header
// (presumably in the included Postfix_impl.hpp and its bases); confirm the result range there.
template <typename Sentence1, typename Sentence2>
47+
double postfix_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0)
48+
{
49+
return detail::Postfix::normalized_distance(s1, s2, score_cutoff);
50+
}
51+
52+
// Normalized postfix similarity for two sequences given as iterator pairs (first1, last1) and
// (first2, last2).
// Thin wrapper: passes every argument through to detail::Postfix::normalized_similarity and
// returns its double result unchanged. score_cutoff defaults to 0.0.
// NOTE(review): detail::Postfix and its normalization logic are not declared in this header
// (presumably in the included Postfix_impl.hpp and its bases); confirm the result range there.
template <typename InputIt1, typename InputIt2>
53+
double postfix_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
54+
double score_cutoff = 0.0)
55+
{
56+
return detail::Postfix::normalized_similarity(first1, last1, first2, last2, score_cutoff);
57+
}
58+
59+
// Normalized postfix similarity for two whole sequences s1 and s2, taken by const reference.
// Thin wrapper: passes every argument through to detail::Postfix::normalized_similarity and
// returns its double result unchanged. score_cutoff defaults to 0.0.
// NOTE(review): detail::Postfix and its normalization logic are not declared in this header
// (presumably in the included Postfix_impl.hpp and its bases); confirm the result range there.
template <typename Sentence1, typename Sentence2>
60+
double postfix_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
61+
{
62+
return detail::Postfix::normalized_similarity(s1, s2, score_cutoff);
63+
}
64+
65+
// Postfix scorer that keeps its own copy of the first sequence (member s1) so it can be
// compared against many second sequences.
// The struct passes itself as the first template argument of detail::CachedSimilarityBase;
// the remaining arguments are int64_t, 0 and the largest int64_t value.
// NOTE(review): CachedSimilarityBase is not declared in this header - presumably it provides
// the public similarity()/distance() entry points that call maximum()/_similarity() below;
// confirm against its definition.
template <typename CharT1>
66+
struct CachedPostfix : public detail::CachedSimilarityBase<CachedPostfix<CharT1>, int64_t, 0,
67+
std::numeric_limits<int64_t>::max()> {
68+
// Builds the scorer from a whole sequence by delegating to the iterator-pair constructor
// with detail::to_begin(s1_) / detail::to_end(s1_).
template <typename Sentence1>
69+
explicit CachedPostfix(const Sentence1& s1_) : CachedPostfix(detail::to_begin(s1_), detail::to_end(s1_))
70+
{}
71+
72+
// Builds the scorer from an iterator pair; the elements are copied into the s1 string.
template <typename InputIt1>
73+
CachedPostfix(InputIt1 first1, InputIt1 last1) : s1(first1, last1)
74+
{}
75+
76+
private:
77+
// The hooks below are private; these friend declarations give the base templates access.
friend detail::CachedSimilarityBase<CachedPostfix<CharT1>, int64_t, 0, std::numeric_limits<int64_t>::max()>;
78+
friend detail::CachedNormalizedMetricBase<CachedPostfix<CharT1>>;
79+
80+
// Returns the larger of the stored sequence's length and s2.size().
// NOTE(review): std::max needs both arguments to have the same type, so this only compiles
// when detail::Range::size() returns exactly int64_t; confirm on platforms where ptrdiff_t
// and int64_t are distinct types.
template <typename InputIt2>
81+
int64_t maximum(detail::Range<InputIt2> s2) const
82+
{
83+
return std::max(static_cast<int64_t>(s1.size()), s2.size());
84+
}
85+
86+
// Forwards the stored sequence, s2 and score_cutoff to detail::Postfix::similarity and
// returns its result. Nothing is precomputed from s1 beyond storing the copy.
template <typename InputIt2>
87+
int64_t _similarity(detail::Range<InputIt2> s2, int64_t score_cutoff) const
88+
{
89+
return detail::Postfix::similarity(s1, s2, score_cutoff);
90+
}
91+
92+
// Owned copy of the first sequence.
std::basic_string<CharT1> s1;
93+
};
94+
95+
// Deduction guide: constructing a CachedPostfix from a single sequence argument deduces
// CharT1 as char_type<Sentence1>.
template <typename Sentence1>
96+
explicit CachedPostfix(const Sentence1& s1_) -> CachedPostfix<char_type<Sentence1>>;
97+
98+
// Deduction guide: constructing a CachedPostfix from an iterator pair deduces CharT1 as
// iter_value_t<InputIt1>.
template <typename InputIt1>
99+
CachedPostfix(InputIt1 first1, InputIt1 last1) -> CachedPostfix<iter_value_t<InputIt1>>;
100+
101+
/**@}*/
102+
103+
} // namespace rapidfuzz

0 commit comments

Comments
 (0)