Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

same_if_short_relative_edit_dist_per_words.py 1.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
  1. from typing import Optional, Tuple
  2. import Levenshtein
  3. from bohrapi.artifacts.identity import Identity
  4. from bohrapi.core import Heuristic
  5. from bohrlabels.core import OneOrManyLabels
  6. from bohrlabels.labels import MatchLabel
  7. @Heuristic(Identity, Identity)
  8. def same_if_short_relative_edit_dist_per_words(
  9. identities: Tuple[Identity, Identity]
  10. ) -> Optional[OneOrManyLabels]:
  11. """
  12. >>> same_if_short_relative_edit_dist_per_words((Identity({"names": ["Hlib Bue"]}), Identity({"names": ["Hlib Babiy"]})))
  13. MatchLabel.NoMatch
  14. >>> same_if_short_relative_edit_dist_per_words((Identity({"names": ["Hlib Babii"]}), Identity({"names": ["Andrew Babii"]})))
  15. MatchLabel.NoMatch
  16. >>> same_if_short_relative_edit_dist_per_words((Identity({"names": ["Andrii Babii"]}), Identity({"names": ["Andrew Babii"]})))
  17. MatchLabel.Match
  18. >>> same_if_short_relative_edit_dist_per_words((Identity({}), Identity({}))) is None
  19. True
  20. """
  21. name1 = identities[0].name
  22. name2 = identities[1].name
  23. if name1 is not None and name2 is not None:
  24. if len(spl1 := name1.split(" ")) >= 2 and len(spl2 := name2.split(" ")) >= 2:
  25. distance1 = Levenshtein.distance(spl1[0], spl2[0])
  26. distance2 = Levenshtein.distance(spl1[1], spl2[1])
  27. max_length1 = max(len(spl1[0]), len(spl2[0]))
  28. max_length2 = max(len(spl1[1]), len(spl2[1]))
  29. return (
  30. MatchLabel.Match
  31. if min(
  32. (max_length1 - distance1) / max_length1,
  33. (max_length2 - distance2) / max_length2,
  34. )
  35. >= 0.6
  36. else MatchLabel.NoMatch
  37. )
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...