Coverage for pycommons / strings / chars.py: 100%

12 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-11 03:04 +0000

1"""Constants for common characters.""" 

2 

3from typing import Callable, Final 

4 

#: The non-breaking space character.
NBSP: Final[str] = "\xa0"
#: The non-breaking hyphen character.
NBDASH: Final[str] = "\u2011"

#: All white space characters that do *not* break lines, in ascending code
#: point order. This is a plain string holding the characters themselves,
#: not a compiled pattern.
WHITESPACE: Final[str] = (
    "\t\x0b\x0c \xa0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
    "\u2008\u2009\u200a\u202f\u205f\u3000")

#: All newline (line-breaking) characters, in ascending code point order.
#: This is a plain string holding the characters themselves.
NEWLINE: Final[str] = "\n\r\x85\u2028\u2029"

#: All characters that are either white space or newlines, in ascending code
#: point order. It is derived from :const:`WHITESPACE` and :const:`NEWLINE`
#: instead of being spelled out a second time, so the three constants cannot
#: drift apart.
WHITESPACE_OR_NEWLINE: Final[str] = "".join(sorted(WHITESPACE + NEWLINE))

24 

25 

#: The internal lookup function converting one normal character to its
#: Unicode superscript form. It is the bound ``__getitem__`` of the table
#: below, so a character without an entry raises :class:`KeyError`.
__SUPERSCRIPT: Final[Callable[[str], str]] = {
    # numbers from 0 to 9
    "\x30": "\u2070",  # 0
    "\x31": "\xb9",  # 1
    "\x32": "\xb2",  # 2
    "\x33": "\xb3",  # 3
    "\x34": "\u2074",  # 4
    "\x35": "\u2075",  # 5
    "\x36": "\u2076",  # 6
    "\x37": "\u2077",  # 7
    "\x38": "\u2078",  # 8
    "\x39": "\u2079",  # 9
    # +/-/=/(/)
    "\x2b": "\u207A",  # +
    "\x2d": "\u207b",  # -
    "\x3d": "\u207c",  # =
    "\x28": "\u207d",  # (
    "\x29": "\u207e",  # )
    # upper case letters
    "\x41": "\u1d2c",  # A
    "\x42": "\u1d2e",  # B
    "\x43": "\ua7f2",  # C
    "\x44": "\u1d30",  # D
    "\x45": "\u1d31",  # E
    "\x46": "\ua7f3",  # F
    "\x47": "\u1d33",  # G
    "\x48": "\u1d34",  # H
    "\x49": "\u1d35",  # I
    "\x4a": "\u1d36",  # J
    "\x4b": "\u1d37",  # K
    "\x4c": "\u1d38",  # L
    "\x4d": "\u1d39",  # M
    "\x4e": "\u1d3a",  # N
    "\x4f": "\u1d3c",  # O
    "\x50": "\u1d3e",  # P
    "\x51": "\ua7f4",  # Q
    "\x52": "\u1d3f",  # R
    "\x53": "\ua7f1",  # S
    "\x54": "\u1d40",  # T
    "\x55": "\u1d41",  # U
    "\x56": "\u2c7d",  # V
    "\x57": "\u1d42",  # W
    # lower case letters
    "\x61": "\u1d43",  # a
    "\x62": "\u1d47",  # b
    "\x63": "\u1d9c",  # c
    "\x64": "\u1d48",  # d
    "\x65": "\u1d49",  # e
    "\x66": "\u1da0",  # f
    "\x67": "\u1d4d",  # g
    "\x68": "\u02b0",  # h
    "\x69": "\u2071",  # i
    "\x6a": "\u02b2",  # j
    "\x6b": "\u1d4f",  # k
    # NOTE(review): "\u1da9" looks like the retroflex-hook variant of the
    # modifier letter l; the plain modifier letter small l would be
    # "\u02e1". Kept unchanged so existing output stays stable -- confirm.
    "\x6c": "\u1da9",  # l; alternative: "\u02e1"
    "\x6d": "\u1d50",  # m
    "\x6e": "\u207f",  # n
    "\x6f": "\u1d52",  # o
    "\x70": "\u1d56",  # p
    # The target code point U+107A5 lies outside the Basic Multilingual
    # Plane. A 4-digit "\u" escape would stop after "107a" and leave a
    # literal "5" behind, so the 8-digit "\U" escape is required here.
    "\x71": "\U000107a5",  # q
    "\x72": "\u02b3",  # r
    "\x73": "\u02e2",  # s
    "\x74": "\u1d57",  # t
    "\x75": "\u1d58",  # u
    "\x76": "\u1d5b",  # v
    "\x77": "\u02b7",  # w
    "\x78": "\u02e3",  # x
    "\x79": "\u02b8",  # y
    "\x7a": "\u1dbb",  # z
    # white space and newlines are mapped to themselves
    " ": " ",
    "\t": "\t",
    "\n": "\n",
    "\x0b": "\x0b",
    "\x0c": "\x0c",
    "\r": "\r",
    "\x85": "\x85",
    "\xa0": "\xa0",
    "\u1680": "\u1680",
    "\u2000": "\u2000",
    "\u2001": "\u2001",
    "\u2002": "\u2002",
    "\u2003": "\u2003",
    "\u2004": "\u2004",
    "\u2005": "\u2005",
    "\u2006": "\u2006",
    "\u2007": "\u2007",
    "\u2008": "\u2008",
    "\u2009": "\u2009",
    "\u200a": "\u200a",
    "\u2028": "\u2028",
    "\u2029": "\u2029",
    "\u202f": "\u202f",
    "\u205f": "\u205f",
    "\u3000": "\u3000",
}.__getitem__


def superscript(s: str) -> str:
    """
    Transform a string into Unicode-based superscript.

    All characters that can be represented as superscript in unicode will be
    translated to superscript. Notice that only a subset of the latin
    characters can be converted to unicode superscript. If any character
    cannot be translated, it will raise a :class:`KeyError`. White space is
    preserved.

    :param s: the string
    :returns: the string in superscript
    :raises KeyError: if a character cannot be converted
    :raises TypeError: if `s` is not a string

    >>> superscript("a0 =4(e)")
    '\u1d43\u2070 \u207c\u2074\u207d\u1d49\u207e'

    >>> len(superscript("q"))
    1

    >>> try:
    ...     superscript("a0=4(e)Y")
    ... except KeyError as ke:
    ...     print(ke)
    'Y'

    >>> try:
    ...     superscript(None)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__iter__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     superscript(1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__iter__' requires a 'str' object but received a 'int'
    """
    # Calling the unbound ``str.__iter__`` rejects every non-string argument
    # with a TypeError before any character is looked up.
    return "".join(map(__SUPERSCRIPT, str.__iter__(s)))

162 

163 

#: The internal lookup function converting one normal character to its
#: Unicode subscript form. It is the bound ``__getitem__`` of the table
#: below, so a character without an entry raises :class:`KeyError`.
__SUBSCRIPT: Final[Callable[[str], str]] = {
    # numbers from 0 to 9
    "\x30": "\u2080",  # 0
    "\x31": "\u2081",  # 1
    "\x32": "\u2082",  # 2
    "\x33": "\u2083",  # 3
    "\x34": "\u2084",  # 4
    "\x35": "\u2085",  # 5
    "\x36": "\u2086",  # 6
    "\x37": "\u2087",  # 7
    "\x38": "\u2088",  # 8
    "\x39": "\u2089",  # 9
    # +/-/=/(/)
    "\x2b": "\u208a",  # +
    "\x2d": "\u208b",  # -
    "\x3d": "\u208c",  # =
    "\x28": "\u208d",  # (
    "\x29": "\u208e",  # )
    # lower case letters
    "\x61": "\u2090",  # a
    "\x65": "\u2091",  # e
    "\x68": "\u2095",  # h
    "\x69": "\u1d62",  # i
    "\x6a": "\u2c7c",  # j
    "\x6b": "\u2096",  # k
    "\x6c": "\u2097",  # l
    "\x6d": "\u2098",  # m
    "\x6e": "\u2099",  # n
    "\x6f": "\u2092",  # o
    "\x70": "\u209a",  # p
    "\x72": "\u1d63",  # r (sits between i = U+1D62 and u = U+1D64)
    "\x73": "\u209b",  # s
    "\x74": "\u209c",  # t
    "\x75": "\u1d64",  # u
    "\x76": "\u1d65",  # v
    "\x78": "\u2093",  # x
    # Both cases of the schwa map to the single subscript schwa.
    "\u018f": "\u2094",  # capital letter schwa (upside-down "e")
    "\u0259": "\u2094",  # small letter schwa
    # white space and newlines are mapped to themselves
    " ": " ",
    "\t": "\t",
    "\n": "\n",
    "\x0b": "\x0b",
    "\x0c": "\x0c",
    "\r": "\r",
    "\x85": "\x85",
    "\xa0": "\xa0",
    "\u1680": "\u1680",
    "\u2000": "\u2000",
    "\u2001": "\u2001",
    "\u2002": "\u2002",
    "\u2003": "\u2003",
    "\u2004": "\u2004",
    "\u2005": "\u2005",
    "\u2006": "\u2006",
    "\u2007": "\u2007",
    "\u2008": "\u2008",
    "\u2009": "\u2009",
    "\u200a": "\u200a",
    "\u2028": "\u2028",
    "\u2029": "\u2029",
    "\u202f": "\u202f",
    "\u205f": "\u205f",
    "\u3000": "\u3000",
}.__getitem__


def subscript(s: str) -> str:
    """
    Transform a string into Unicode-based subscript.

    All characters that can be represented as subscript in unicode will be
    translated to subscript. Notice that only a subset of the latin
    characters can be converted to unicode subscript. If any character
    cannot be translated, it will raise a :class:`KeyError`. White space is
    preserved.

    :param s: the string
    :returns: the string in subscript
    :raises KeyError: if a character cannot be converted
    :raises TypeError: if `s` is not a string

    >>> subscript("a0= 4(e)")
    '\u2090\u2080\u208c \u2084\u208d\u2091\u208e'

    >>> try:
    ...     subscript("a0=4(e)Y")
    ... except KeyError as ke:
    ...     print(ke)
    'Y'

    >>> try:
    ...     subscript(None)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__iter__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     subscript(1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__iter__' requires a 'str' object but received a 'int'
    """
    # Calling the unbound ``str.__iter__`` rejects every non-string argument
    # with a TypeError before any character is looked up.
    return "".join(map(__SUBSCRIPT, str.__iter__(s)))