Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

hebrew_utils.py 4.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
  1. import numpy as np
  2. # Unicode codepoints for nikud:
  3. # NOTE: Some of these are extended nikud which we will not use
  4. # 1456 HEBREW POINT SHEVA
  5. # 1457 HEBREW POINT HATAF SEGOL
  6. # 1458 HEBREW POINT HATAF PATAH
  7. # 1459 HEBREW POINT HATAF QAMATS
  8. # 1460 HEBREW POINT HIRIQ
  9. # 1461 HEBREW POINT TSERE
  10. # 1462 HEBREW POINT SEGOL
  11. # 1463 HEBREW POINT PATAH
  12. # 1464 HEBREW POINT QAMATS
  13. # 1465 HEBREW POINT HOLAM
  14. # 1466 HEBREW POINT HOLAM HASER FOR VAV ***EXTENDED***
  15. # 1467 HEBREW POINT QUBUTS
  16. # 1468 HEBREW POINT DAGESH OR MAPIQ
  17. # 1469 HEBREW POINT METEG ***EXTENDED***
  18. # 1470 HEBREW PUNCTUATION MAQAF ***EXTENDED***
  19. # 1471 HEBREW POINT RAFE ***EXTENDED***
  20. # 1472 HEBREW PUNCTUATION PASEQ ***EXTENDED***
  21. # 1473 HEBREW POINT SHIN DOT
  22. # 1474 HEBREW POINT SIN DOT
  23. NIKUD_START_ORD = 1456
  24. NIKUD_END_ORD = 1474
  25. SPECIAL_ORDS = {1466, 1469, 1470, 1471, 1472}
  26. # Extended nikud: includes symbols such as rafe which we strip, but do not add to texts
  27. EXTENDED_NIKUD = {chr(i) for i in range(NIKUD_START_ORD, NIKUD_END_ORD + 1)}
  28. # Nikud: ordinary nikud that we add to texts
  29. NIKUD = {c for c in EXTENDED_NIKUD if ord(c) not in SPECIAL_ORDS}
  30. N_VOWELS = len(NIKUD) - 3 # not including dagesh, shin dot, sin dot
  31. idx2chr = dict()
  32. j = 0
  33. for i in range(NIKUD_START_ORD, NIKUD_END_ORD + 1):
  34. if i not in SPECIAL_ORDS:
  35. idx2chr[j] = chr(i)
  36. j += 1
  37. def strip_nikud(s):
  38. if type(s) is str:
  39. out = s
  40. for N in EXTENDED_NIKUD:
  41. out = out.replace(N, '')
  42. return out
  43. out = s.copy() # pd Series
  44. for N in EXTENDED_NIKUD:
  45. out = out.str.replace(N, '')
  46. return out
  47. def text_contains_nikud(text):
  48. return len(set(text) & EXTENDED_NIKUD) > 0
  49. ABG = set('אבגדהוזחטיכךלמםנןסעפףצץקרשת')
  50. def text_contains_abg(text):
  51. return len(set(text) & ABG) > 0
  52. # CHARSET = NIKUD | ABG
  53. YUD = 'י'
  54. VAV = 'ו'
  55. YV = YUD + VAV
### Utilities for converting (haser, male) text pairs into input & target for the nikud model ###
# haser: text in ktiv haser; includes nikud, but not the extra yuds/vavs
# male: the same text in ktiv male, i.e. spelled with the extra yuds/vavs
  58. def align_haser_male(haser, male):
  59. '''Input: pairs of texts in ktiv haser (with nikud) and ktiv male
  60. Output: list of pairs (c1, c2) of characters; c1 in haser, c2 in male'''
  61. i = 0
  62. j = 0
  63. output = []
  64. while i < len(haser) and j < len(male):
  65. if i >= len(haser):
  66. output += [('', male[j])]
  67. j += 1
  68. elif j >= len(male):
  69. output += [(haser[i], '')]
  70. i += 1
  71. elif haser[i] == male[j]:
  72. output += [(haser[i], male[j])]
  73. i += 1
  74. j += 1
  75. elif haser[i] in NIKUD:
  76. output += [(haser[i], '')]
  77. i += 1
  78. else:
  79. output += [('', male[j])]
  80. j += 1
  81. return output
  82. def chunk_haser_male(haser, male):
  83. '''uses alignment from previous method to split text into chunks
  84. outputs list of chunks, one chunk has format: (str, bool)
  85. str: Hebrew consonant with vowel(s) attached
  86. bool: True iff letter should be deleted (i.e. extra yud/vav)'''
  87. aligned = align_haser_male(haser, male)
  88. chunks = []
  89. del_flags = []
  90. cur_chunk = ''
  91. for c1, c2 in aligned:
  92. if c1 == c2:
  93. if cur_chunk != '':
  94. chunks.append(cur_chunk)
  95. del_flags.append(False)
  96. cur_chunk = ''
  97. cur_chunk += c1
  98. elif c1 == '':
  99. if cur_chunk != '':
  100. chunks.append(cur_chunk)
  101. del_flags.append(False)
  102. cur_chunk = ''
  103. chunks.append(c2)
  104. del_flags.append(True)
  105. else:
  106. cur_chunk += c1
  107. if cur_chunk != '':
  108. chunks.append(cur_chunk)
  109. del_flags.append(False)
  110. return list(zip(chunks, del_flags))
  111. def chunk2target(chunk):
  112. '''turns chunks from previous method into multilabel targets for nikud model'''
  113. text, del_flag = chunk
  114. nikkud_list = [
  115. int(chr(n) in text)
  116. for n in range(NIKUD_START_ORD, NIKUD_END_ORD + 1)
  117. if n not in SPECIAL_ORDS
  118. ]
  119. return nikkud_list + [int(del_flag)]
  120. def haser_male2target(haser, male):
  121. '''Input: pairs of texts in ktiv haser (with nikud) and ktiv male
  122. Output: multilabel targets for nikud model'''
  123. chunked = chunk_haser_male(haser, male)
  124. return np.vstack([chunk2target(chunk) for chunk in chunked])
  125. if __name__ == '__main__':
  126. haser = 'הַכְּרֻבִים'
  127. male = 'הכרובים'
  128. print(haser)
  129. print(male)
  130. print(chunk_haser_male(haser, male))
  131. print(haser_male2target(haser, male))
  132. print(haser_male2target(haser, male).shape)
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...