Mercurial > lcfOS
comparison cos/python/Objects/unicodeobject.c @ 27:7f74363f4c82
Added some files for the python port
author | windel |
---|---|
date | Tue, 27 Dec 2011 18:59:02 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
26:dcce92b1efbc | 27:7f74363f4c82 |
---|---|
1 /* | |
2 | |
3 Unicode implementation based on original code by Fredrik Lundh, | |
4 modified by Marc-Andre Lemburg <mal@lemburg.com>. | |
5 | |
6 Major speed upgrades to the method implementations at the Reykjavik | |
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. | |
8 | |
9 Copyright (c) Corporation for National Research Initiatives. | |
10 | |
11 -------------------------------------------------------------------- | |
12 The original string type implementation is: | |
13 | |
14 Copyright (c) 1999 by Secret Labs AB | |
15 Copyright (c) 1999 by Fredrik Lundh | |
16 | |
17 By obtaining, using, and/or copying this software and/or its | |
18 associated documentation, you agree that you have read, understood, | |
19 and will comply with the following terms and conditions: | |
20 | |
21 Permission to use, copy, modify, and distribute this software and its | |
22 associated documentation for any purpose and without fee is hereby | |
23 granted, provided that the above copyright notice appears in all | |
24 copies, and that both that copyright notice and this permission notice | |
25 appear in supporting documentation, and that the name of Secret Labs | |
26 AB or the author not be used in advertising or publicity pertaining to | |
27 distribution of the software without specific, written prior | |
28 permission. | |
29 | |
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO | |
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND | |
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR | |
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT | |
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
37 -------------------------------------------------------------------- | |
38 | |
39 */ | |
40 | |
41 #define PY_SSIZE_T_CLEAN | |
42 #include "Python.h" | |
43 #include "ucnhash.h" | |
44 | |
/* Byte-order selection: big endian only when the build defines
   WORDS_BIGENDIAN, little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
52 | |
53 /* --- Globals ------------------------------------------------------------ | |
54 | |
55 The globals are initialized by the _PyUnicode_Init() API and should | |
56 not be used before calling that API. | |
57 | |
58 */ | |
59 | |
60 | |
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Debug builds route the check through _PyUnicode_CheckConsistency()
   (without content scan); other builds only use PyUnicode_Check(). */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
69 | |
/* Field accessors.  The names starting with an underscore (except
   _PyUnicode_KIND and _PyUnicode_GET_LENGTH) touch the structure member
   directly and can be used as lvalues; the remaining ones first assert
   that the object passes _PyUnicode_CHECK(). */

/* utf8 member of the compact header */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: the bytes following the PyASCIIObject
   header for compact ASCII strings, the utf8 member otherwise */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char*)((PyASCIIObject*)(op) + 1)) \
         : _PyUnicode_UTF8(op))

/* utf8_length member of the compact header */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the character count for compact ASCII
   strings, the utf8_length member otherwise */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject*)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and its length */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state bit-field and cached hash */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* checked read of the kind and of the length */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data pointer of a non-compact object */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
104 | |
/* File-local replacement for PyUnicode_READY: asserts that the object
   passes _PyUnicode_CHECK(), evaluates to 0 when the string is already
   ready and to the result of _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
111 | |
/* True when the utf8 pointer is the same address as the character data.
   Asserts that the object is not a compact ASCII string. */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True when the wstr pointer is the same address as the character data.
   Every use of the argument goes through the macro parameter, so the
   macro works regardless of how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
119 | |
/* Non-zero when the object has a UTF-8 buffer of its own: it is not a
   compact ASCII string, the utf8 pointer is set, and that pointer differs
   from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) && \
      _PyUnicode_UTF8(op) && \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object has a wstr buffer of its own: the wstr pointer
   is set and either the string is not ready yet or the pointer differs
   from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
135 | |
/* Generic helper macro to convert characters of different types.

   from_type, to_type -- valid type names of the source and destination
                         characters
   begin, end         -- pointers of type "from_type *" delimiting the
                         source characters (end is exclusive)
   to                 -- pointer to the buffer receiving the converted
                         characters; it is cast to "to_type *" as a whole,
                         so an expression such as "buf + offset" keeps its
                         own pointer arithmetic

   The main loop copies four characters per iteration; the trailing loop
   handles the remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
159 | |
/* Invalidate the cached hash (set it to -1) after the string has been
   modified. */
#define _PyUnicode_DIRTY(op) \
    do { \
        _PyUnicode_HASH(op) = -1; \
    } while (0)
162 | |
163 /* This dictionary holds all interned unicode strings. Note that references | |
164 to strings in this dictionary are *not* counted in the string's ob_refcnt. | |
165 When the interned string reaches a refcnt of 0 the string deallocation | |
166 function will delete the reference from this dictionary. | |
167 | |
168 Another way to look at this is that to say that the actual reference | |
169 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) | |
170 */ | |
171 static PyObject *interned; | |
172 | |
173 /* The empty Unicode object is shared to improve performance. */ | |
174 static PyObject *unicode_empty; | |
175 | |
176 /* List of static strings. */ | |
177 static _Py_Identifier *static_strings; | |
178 | |
179 /* Single character Unicode strings in the Latin-1 range are being | |
180 shared as well. */ | |
181 static PyObject *unicode_latin1[256]; | |
182 | |
/* Lookup table for the most frequent whitespace characters: one entry
   per ASCII code (0-127), 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: CHARACTER TABULATION (0x09), LINE FEED (0x0A),
       LINE TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
213 | |
214 /* forward */ | |
215 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); | |
216 static PyObject* get_latin1_char(unsigned char ch); | |
217 static void copy_characters( | |
218 PyObject *to, Py_ssize_t to_start, | |
219 PyObject *from, Py_ssize_t from_start, | |
220 Py_ssize_t how_many); | |
221 | |
222 static PyObject * | |
223 unicode_fromascii(const unsigned char *s, Py_ssize_t size); | |
224 static PyObject * | |
225 _PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); | |
226 static PyObject * | |
227 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); | |
228 static PyObject * | |
229 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); | |
230 | |
231 static PyObject * | |
232 unicode_encode_call_errorhandler(const char *errors, | |
233 PyObject **errorHandler,const char *encoding, const char *reason, | |
234 PyObject *unicode, PyObject **exceptionObject, | |
235 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); | |
236 | |
237 static void | |
238 raise_encode_exception(PyObject **exceptionObject, | |
239 const char *encoding, | |
240 PyObject *unicode, | |
241 Py_ssize_t startpos, Py_ssize_t endpos, | |
242 const char *reason); | |
243 | |
/* Lookup table for ASCII line breaks: one entry per ASCII code (0-127),
   1 for a line break character and 0 otherwise.  The table is only ever
   read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: LINE FEED (0x0A), LINE TABULATION (0x0B),
       FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
271 | |
272 /* The max unicode value is always 0x10FFFF while using the PEP-393 API. | |
273 This function is kept for backward compatibility with the old API. */ | |
274 Py_UNICODE | |
275 PyUnicode_GetMax(void) | |
276 { | |
277 #ifdef Py_UNICODE_WIDE | |
278 return 0x10FFFF; | |
279 #else | |
280 /* This is actually an illegal character, so it should | |
281 not be passed to unichr. */ | |
282 return 0xFFFF; | |
283 #endif | |
284 } | |
285 | |
#ifdef Py_DEBUG
/* Debug-build checker for the internal layout of a string object.

   Every invariant is verified with assert(); when the function returns,
   it returns 1, which allows assert(_PyUnicode_CheckConsistency(...)).

   op            -- object to inspect; must pass PyUnicode_Check()
   check_content -- when non-zero and the string is not of
                    PyUnicode_WCHAR_KIND, also scan the characters and
                    verify that the kind (and the ascii flag) matches the
                    largest character found */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly for the kind whose character
               size equals sizeof(wchar_t); this mirrors the sharing rule
               applied by PyUnicode_New() */
            unsigned int wchar_kind = (sizeof(wchar_t) == 2)
                                      ? PyUnicode_2BYTE_KIND
                                      : PyUnicode_4BYTE_KIND;

            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
393 | |
394 static PyObject* | |
395 unicode_result_wchar(PyObject *unicode) | |
396 { | |
397 #ifndef Py_DEBUG | |
398 Py_ssize_t len; | |
399 | |
400 assert(Py_REFCNT(unicode) == 1); | |
401 | |
402 len = _PyUnicode_WSTR_LENGTH(unicode); | |
403 if (len == 0) { | |
404 Py_INCREF(unicode_empty); | |
405 Py_DECREF(unicode); | |
406 return unicode_empty; | |
407 } | |
408 | |
409 if (len == 1) { | |
410 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; | |
411 if (ch < 256) { | |
412 PyObject *latin1_char = get_latin1_char((unsigned char)ch); | |
413 Py_DECREF(unicode); | |
414 return latin1_char; | |
415 } | |
416 } | |
417 | |
418 if (_PyUnicode_Ready(unicode) < 0) { | |
419 Py_XDECREF(unicode); | |
420 return NULL; | |
421 } | |
422 #else | |
423 /* don't make the result ready in debug mode to ensure that the caller | |
424 makes the string ready before using it */ | |
425 assert(_PyUnicode_CheckConsistency(unicode, 1)); | |
426 #endif | |
427 return unicode; | |
428 } | |
429 | |
430 static PyObject* | |
431 unicode_result_ready(PyObject *unicode) | |
432 { | |
433 Py_ssize_t length; | |
434 | |
435 length = PyUnicode_GET_LENGTH(unicode); | |
436 if (length == 0) { | |
437 if (unicode != unicode_empty) { | |
438 Py_INCREF(unicode_empty); | |
439 Py_DECREF(unicode); | |
440 } | |
441 return unicode_empty; | |
442 } | |
443 | |
444 if (length == 1) { | |
445 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); | |
446 if (ch < 256) { | |
447 PyObject *latin1_char = unicode_latin1[ch]; | |
448 if (latin1_char != NULL) { | |
449 if (unicode != latin1_char) { | |
450 Py_INCREF(latin1_char); | |
451 Py_DECREF(unicode); | |
452 } | |
453 return latin1_char; | |
454 } | |
455 else { | |
456 assert(_PyUnicode_CheckConsistency(unicode, 1)); | |
457 Py_INCREF(unicode); | |
458 unicode_latin1[ch] = unicode; | |
459 return unicode; | |
460 } | |
461 } | |
462 } | |
463 | |
464 assert(_PyUnicode_CheckConsistency(unicode, 1)); | |
465 return unicode; | |
466 } | |
467 | |
468 static PyObject* | |
469 unicode_result(PyObject *unicode) | |
470 { | |
471 assert(_PyUnicode_CHECK(unicode)); | |
472 if (PyUnicode_IS_READY(unicode)) | |
473 return unicode_result_ready(unicode); | |
474 else | |
475 return unicode_result_wchar(unicode); | |
476 } | |
477 | |
#if defined(HAVE_MBCS)
/* OS version record; only declared when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
481 | |
482 /* --- Bloom Filters ----------------------------------------------------- */ | |
483 | |
484 /* stuff to implement simple "bloom filters" for Unicode characters. | |
485 to keep things simple, we use a single bitmask, using the least 5 | |
486 bits from each unicode characters as the bit index. */ | |
487 | |
488 /* the linebreak mask is set up by Unicode_Init below */ | |
489 | |
490 #if LONG_BIT >= 128 | |
491 #define BLOOM_WIDTH 128 | |
492 #elif LONG_BIT >= 64 | |
493 #define BLOOM_WIDTH 64 | |
494 #elif LONG_BIT >= 32 | |
495 #define BLOOM_WIDTH 32 | |
496 #else | |
497 #error "LONG_BIT is smaller than 32" | |
498 #endif | |
499 | |
500 #define BLOOM_MASK unsigned long | |
501 | |
502 static BLOOM_MASK bloom_linebreak; | |
503 | |
504 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) | |
505 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) | |
506 | |
507 #define BLOOM_LINEBREAK(ch) \ | |
508 ((ch) < 128U ? ascii_linebreak[(ch)] : \ | |
509 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) | |
510 | |
511 Py_LOCAL_INLINE(BLOOM_MASK) | |
512 make_bloom_mask(int kind, void* ptr, Py_ssize_t len) | |
513 { | |
514 /* calculate simple bloom-style bitmask for a given unicode string */ | |
515 | |
516 BLOOM_MASK mask; | |
517 Py_ssize_t i; | |
518 | |
519 mask = 0; | |
520 for (i = 0; i < len; i++) | |
521 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); | |
522 | |
523 return mask; | |
524 } | |
525 | |
/* Membership test of chr in str: the bloom mask is consulted first and
   PyUnicode_FindChar() (forward search over the whole string) is only
   called when the mask bit is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
529 | |
530 /* Compilation of templated routines */ | |
531 | |
532 #include "stringlib/asciilib.h" | |
533 #include "stringlib/fastsearch.h" | |
534 #include "stringlib/partition.h" | |
535 #include "stringlib/split.h" | |
536 #include "stringlib/count.h" | |
537 #include "stringlib/find.h" | |
538 #include "stringlib/find_max_char.h" | |
539 #include "stringlib/localeutil.h" | |
540 #include "stringlib/undef.h" | |
541 | |
542 #include "stringlib/ucs1lib.h" | |
543 #include "stringlib/fastsearch.h" | |
544 #include "stringlib/partition.h" | |
545 #include "stringlib/split.h" | |
546 #include "stringlib/count.h" | |
547 #include "stringlib/find.h" | |
548 #include "stringlib/find_max_char.h" | |
549 #include "stringlib/localeutil.h" | |
550 #include "stringlib/undef.h" | |
551 | |
552 #include "stringlib/ucs2lib.h" | |
553 #include "stringlib/fastsearch.h" | |
554 #include "stringlib/partition.h" | |
555 #include "stringlib/split.h" | |
556 #include "stringlib/count.h" | |
557 #include "stringlib/find.h" | |
558 #include "stringlib/find_max_char.h" | |
559 #include "stringlib/localeutil.h" | |
560 #include "stringlib/undef.h" | |
561 | |
562 #include "stringlib/ucs4lib.h" | |
563 #include "stringlib/fastsearch.h" | |
564 #include "stringlib/partition.h" | |
565 #include "stringlib/split.h" | |
566 #include "stringlib/count.h" | |
567 #include "stringlib/find.h" | |
568 #include "stringlib/find_max_char.h" | |
569 #include "stringlib/localeutil.h" | |
570 #include "stringlib/undef.h" | |
571 | |
572 #include "stringlib/unicodedefs.h" | |
573 #include "stringlib/fastsearch.h" | |
574 #include "stringlib/count.h" | |
575 #include "stringlib/find.h" | |
576 #include "stringlib/undef.h" | |
577 | |
578 /* --- Unicode Object ----------------------------------------------------- */ | |
579 | |
580 static PyObject * | |
581 fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); | |
582 | |
583 Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, | |
584 Py_ssize_t size, Py_UCS4 ch, | |
585 int direction) | |
586 { | |
587 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; | |
588 | |
589 switch (kind) { | |
590 case PyUnicode_1BYTE_KIND: | |
591 { | |
592 Py_UCS1 ch1 = (Py_UCS1) ch; | |
593 if (ch1 == ch) | |
594 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); | |
595 else | |
596 return -1; | |
597 } | |
598 case PyUnicode_2BYTE_KIND: | |
599 { | |
600 Py_UCS2 ch2 = (Py_UCS2) ch; | |
601 if (ch2 == ch) | |
602 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); | |
603 else | |
604 return -1; | |
605 } | |
606 case PyUnicode_4BYTE_KIND: | |
607 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); | |
608 default: | |
609 assert(0); | |
610 return -1; | |
611 } | |
612 } | |
613 | |
614 static PyObject* | |
615 resize_compact(PyObject *unicode, Py_ssize_t length) | |
616 { | |
617 Py_ssize_t char_size; | |
618 Py_ssize_t struct_size; | |
619 Py_ssize_t new_size; | |
620 int share_wstr; | |
621 | |
622 assert(PyUnicode_IS_READY(unicode)); | |
623 char_size = PyUnicode_KIND(unicode); | |
624 if (PyUnicode_IS_COMPACT_ASCII(unicode)) | |
625 struct_size = sizeof(PyASCIIObject); | |
626 else | |
627 struct_size = sizeof(PyCompactUnicodeObject); | |
628 share_wstr = _PyUnicode_SHARE_WSTR(unicode); | |
629 | |
630 _Py_DEC_REFTOTAL; | |
631 _Py_ForgetReference(unicode); | |
632 | |
633 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { | |
634 PyErr_NoMemory(); | |
635 return NULL; | |
636 } | |
637 new_size = (struct_size + (length + 1) * char_size); | |
638 | |
639 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); | |
640 if (unicode == NULL) { | |
641 PyObject_Del(unicode); | |
642 PyErr_NoMemory(); | |
643 return NULL; | |
644 } | |
645 _Py_NewReference(unicode); | |
646 _PyUnicode_LENGTH(unicode) = length; | |
647 if (share_wstr) { | |
648 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); | |
649 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) | |
650 _PyUnicode_WSTR_LENGTH(unicode) = length; | |
651 } | |
652 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), | |
653 length, 0); | |
654 return unicode; | |
655 } | |
656 | |
657 static int | |
658 resize_inplace(PyObject *unicode, Py_ssize_t length) | |
659 { | |
660 wchar_t *wstr; | |
661 assert(!PyUnicode_IS_COMPACT(unicode)); | |
662 assert(Py_REFCNT(unicode) == 1); | |
663 | |
664 _PyUnicode_DIRTY(unicode); | |
665 | |
666 if (PyUnicode_IS_READY(unicode)) { | |
667 Py_ssize_t char_size; | |
668 Py_ssize_t new_size; | |
669 int share_wstr, share_utf8; | |
670 void *data; | |
671 | |
672 data = _PyUnicode_DATA_ANY(unicode); | |
673 assert(data != NULL); | |
674 char_size = PyUnicode_KIND(unicode); | |
675 share_wstr = _PyUnicode_SHARE_WSTR(unicode); | |
676 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); | |
677 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) | |
678 { | |
679 PyObject_DEL(_PyUnicode_UTF8(unicode)); | |
680 _PyUnicode_UTF8(unicode) = NULL; | |
681 _PyUnicode_UTF8_LENGTH(unicode) = 0; | |
682 } | |
683 | |
684 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { | |
685 PyErr_NoMemory(); | |
686 return -1; | |
687 } | |
688 new_size = (length + 1) * char_size; | |
689 | |
690 data = (PyObject *)PyObject_REALLOC(data, new_size); | |
691 if (data == NULL) { | |
692 PyErr_NoMemory(); | |
693 return -1; | |
694 } | |
695 _PyUnicode_DATA_ANY(unicode) = data; | |
696 if (share_wstr) { | |
697 _PyUnicode_WSTR(unicode) = data; | |
698 _PyUnicode_WSTR_LENGTH(unicode) = length; | |
699 } | |
700 if (share_utf8) { | |
701 _PyUnicode_UTF8(unicode) = data; | |
702 _PyUnicode_UTF8_LENGTH(unicode) = length; | |
703 } | |
704 _PyUnicode_LENGTH(unicode) = length; | |
705 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); | |
706 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { | |
707 assert(_PyUnicode_CheckConsistency(unicode, 0)); | |
708 return 0; | |
709 } | |
710 } | |
711 assert(_PyUnicode_WSTR(unicode) != NULL); | |
712 | |
713 /* check for integer overflow */ | |
714 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { | |
715 PyErr_NoMemory(); | |
716 return -1; | |
717 } | |
718 wstr = _PyUnicode_WSTR(unicode); | |
719 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1)); | |
720 if (!wstr) { | |
721 PyErr_NoMemory(); | |
722 return -1; | |
723 } | |
724 _PyUnicode_WSTR(unicode) = wstr; | |
725 _PyUnicode_WSTR(unicode)[length] = 0; | |
726 _PyUnicode_WSTR_LENGTH(unicode) = length; | |
727 assert(_PyUnicode_CheckConsistency(unicode, 0)); | |
728 return 0; | |
729 } | |
730 | |
731 static PyObject* | |
732 resize_copy(PyObject *unicode, Py_ssize_t length) | |
733 { | |
734 Py_ssize_t copy_length; | |
735 if (PyUnicode_IS_COMPACT(unicode)) { | |
736 PyObject *copy; | |
737 assert(PyUnicode_IS_READY(unicode)); | |
738 | |
739 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); | |
740 if (copy == NULL) | |
741 return NULL; | |
742 | |
743 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); | |
744 copy_characters(copy, 0, unicode, 0, copy_length); | |
745 return copy; | |
746 } | |
747 else { | |
748 PyObject *w; | |
749 assert(_PyUnicode_WSTR(unicode) != NULL); | |
750 assert(_PyUnicode_DATA_ANY(unicode) == NULL); | |
751 w = (PyObject*)_PyUnicode_New(length); | |
752 if (w == NULL) | |
753 return NULL; | |
754 copy_length = _PyUnicode_WSTR_LENGTH(unicode); | |
755 copy_length = Py_MIN(copy_length, length); | |
756 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), | |
757 copy_length); | |
758 return w; | |
759 } | |
760 } | |
761 | |
/* One character more than requested is allocated, to make sure that the
   string is Ux0000 terminated; some code (e.g. new_identifier) relies on
   that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

#if defined(Py_DEBUG)
/* debug builds: incremented by every allocation made by _PyUnicode_New() */
static int unicode_old_new_calls = 0;
#endif
774 | |
775 static PyUnicodeObject * | |
776 _PyUnicode_New(Py_ssize_t length) | |
777 { | |
778 register PyUnicodeObject *unicode; | |
779 size_t new_size; | |
780 | |
781 /* Optimization for empty strings */ | |
782 if (length == 0 && unicode_empty != NULL) { | |
783 Py_INCREF(unicode_empty); | |
784 return (PyUnicodeObject*)unicode_empty; | |
785 } | |
786 | |
787 /* Ensure we won't overflow the size. */ | |
788 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { | |
789 return (PyUnicodeObject *)PyErr_NoMemory(); | |
790 } | |
791 if (length < 0) { | |
792 PyErr_SetString(PyExc_SystemError, | |
793 "Negative size passed to _PyUnicode_New"); | |
794 return NULL; | |
795 } | |
796 | |
797 #ifdef Py_DEBUG | |
798 ++unicode_old_new_calls; | |
799 #endif | |
800 | |
801 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); | |
802 if (unicode == NULL) | |
803 return NULL; | |
804 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); | |
805 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); | |
806 if (!_PyUnicode_WSTR(unicode)) { | |
807 PyErr_NoMemory(); | |
808 goto onError; | |
809 } | |
810 | |
811 /* Initialize the first element to guard against cases where | |
812 * the caller fails before initializing str -- unicode_resize() | |
813 * reads str[0], and the Keep-Alive optimization can keep memory | |
814 * allocated for str alive across a call to unicode_dealloc(unicode). | |
815 * We don't want unicode_resize to read uninitialized memory in | |
816 * that case. | |
817 */ | |
818 _PyUnicode_WSTR(unicode)[0] = 0; | |
819 _PyUnicode_WSTR(unicode)[length] = 0; | |
820 _PyUnicode_WSTR_LENGTH(unicode) = length; | |
821 _PyUnicode_HASH(unicode) = -1; | |
822 _PyUnicode_STATE(unicode).interned = 0; | |
823 _PyUnicode_STATE(unicode).kind = 0; | |
824 _PyUnicode_STATE(unicode).compact = 0; | |
825 _PyUnicode_STATE(unicode).ready = 0; | |
826 _PyUnicode_STATE(unicode).ascii = 0; | |
827 _PyUnicode_DATA_ANY(unicode) = NULL; | |
828 _PyUnicode_LENGTH(unicode) = 0; | |
829 _PyUnicode_UTF8(unicode) = NULL; | |
830 _PyUnicode_UTF8_LENGTH(unicode) = 0; | |
831 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); | |
832 return unicode; | |
833 | |
834 onError: | |
835 /* XXX UNREF/NEWREF interface should be more symmetrical */ | |
836 _Py_DEC_REFTOTAL; | |
837 _Py_ForgetReference((PyObject *)unicode); | |
838 PyObject_Del(unicode); | |
839 return NULL; | |
840 } | |
841 | |
842 static const char* | |
843 unicode_kind_name(PyObject *unicode) | |
844 { | |
845 /* don't check consistency: unicode_kind_name() is called from | |
846 _PyUnicode_Dump() */ | |
847 if (!PyUnicode_IS_COMPACT(unicode)) | |
848 { | |
849 if (!PyUnicode_IS_READY(unicode)) | |
850 return "wstr"; | |
851 switch(PyUnicode_KIND(unicode)) | |
852 { | |
853 case PyUnicode_1BYTE_KIND: | |
854 if (PyUnicode_IS_ASCII(unicode)) | |
855 return "legacy ascii"; | |
856 else | |
857 return "legacy latin1"; | |
858 case PyUnicode_2BYTE_KIND: | |
859 return "legacy UCS2"; | |
860 case PyUnicode_4BYTE_KIND: | |
861 return "legacy UCS4"; | |
862 default: | |
863 return "<legacy invalid kind>"; | |
864 } | |
865 } | |
866 assert(PyUnicode_IS_READY(unicode)); | |
867 switch(PyUnicode_KIND(unicode)) | |
868 { | |
869 case PyUnicode_1BYTE_KIND: | |
870 if (PyUnicode_IS_ASCII(unicode)) | |
871 return "ascii"; | |
872 else | |
873 return "latin1"; | |
874 case PyUnicode_2BYTE_KIND: | |
875 return "UCS2"; | |
876 case PyUnicode_4BYTE_KIND: | |
877 return "UCS4"; | |
878 default: | |
879 return "<invalid compact kind>"; | |
880 } | |
881 } | |
882 | |
#ifdef Py_DEBUG
/* debug builds: incremented by PyUnicode_New() */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Wrapper around the PyUnicode_DATA() macro that also prints, on stdout,
   the flags and candidate addresses involved in locating the data. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the layout of a string object on
   stdout: kind name, length, the wstr and utf8 pointers (flagged as
   "shared" when they alias the data) and the data pointer.

   Lengths are Py_ssize_t values and are printed with %zd; pointers are
   converted to void * as required by %p. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* locate the character data according to the object layout */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* compact ASCII objects have no wstr_length/utf8 members */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
936 | |
/* Create a new compact unicode object with room for `size` characters plus
   a terminating null character.

   `maxchar` selects the storage: < 128 gives a 1-byte ASCII string stored
   after a PyASCIIObject header; < 256 a 1-byte string, < 65536 a 2-byte
   string, and anything else a 4-byte string, all stored after a
   PyCompactUnicodeObject header.

   Returns a new reference, or NULL with an exception set (SystemError for a
   negative size, MemoryError when the size is too large or allocation
   fails).  When size == 0 and the empty-string singleton exists, a new
   reference to that singleton is returned instead.  The character data is
   left uninitialized except for the terminating null. */
PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
{
    PyObject *obj;
    PyCompactUnicodeObject *unicode;
    void *data;
    int kind_state;
    int is_sharing, is_ascii;
    Py_ssize_t char_size;
    Py_ssize_t struct_size;

    /* Optimization for empty strings */
    if (size == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

#ifdef Py_DEBUG
    ++unicode_new_new_calls;
#endif

    /* Pick the kind, the per-character size and the header size from
       maxchar.  is_sharing is set when wchar_t has the same width as the
       chosen character type, so the data buffer can double as wstr. */
    is_ascii = 0;
    is_sharing = 0;
    struct_size = sizeof(PyCompactUnicodeObject);
    if (maxchar < 128) {
        kind_state = PyUnicode_1BYTE_KIND;
        char_size = 1;
        is_ascii = 1;
        struct_size = sizeof(PyASCIIObject);
    }
    else if (maxchar < 256) {
        kind_state = PyUnicode_1BYTE_KIND;
        char_size = 1;
    }
    else if (maxchar < 65536) {
        kind_state = PyUnicode_2BYTE_KIND;
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            is_sharing = 1;
    }
    else {
        kind_state = PyUnicode_4BYTE_KIND;
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            is_sharing = 1;
    }

    /* Ensure we won't overflow the size. */
    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to PyUnicode_New");
        return NULL;
    }
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
        return PyErr_NoMemory();

    /* Duplicated allocation code from _PyObject_New() instead of a call to
     * PyObject_New() so we are able to allocate space for the object and
     * its data buffer.
     */
    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
    if (obj == NULL)
        return PyErr_NoMemory();
    obj = PyObject_INIT(obj, &PyUnicode_Type);
    if (obj == NULL)
        return NULL;

    /* The character data starts immediately after the header that was
       accounted for in struct_size. */
    unicode = (PyCompactUnicodeObject *)obj;
    if (is_ascii)
        data = ((PyASCIIObject*)obj) + 1;
    else
        data = unicode + 1;
    _PyUnicode_LENGTH(unicode) = size;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
    _PyUnicode_STATE(unicode).kind = kind_state;
    _PyUnicode_STATE(unicode).compact = 1;
    _PyUnicode_STATE(unicode).ready = 1;
    _PyUnicode_STATE(unicode).ascii = is_ascii;
    if (is_ascii) {
        /* ASCII objects use the smaller header: only the terminator and
           wstr are initialized here. */
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
    }
    else if (kind_state == PyUnicode_1BYTE_KIND) {
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
    }
    else {
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
        if (kind_state == PyUnicode_2BYTE_KIND)
            ((Py_UCS2*)data)[size] = 0;
        else /* kind_state == PyUnicode_4BYTE_KIND */
            ((Py_UCS4*)data)[size] = 0;
        if (is_sharing) {
            /* wstr aliases the character data */
            _PyUnicode_WSTR_LENGTH(unicode) = size;
            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
        }
        else {
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
            _PyUnicode_WSTR(unicode) = NULL;
        }
    }
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
    return obj;
}
1046 | |
1047 static int | |
1048 _PyUnicode_Dirty(PyObject *unicode) | |
1049 { | |
1050 assert(_PyUnicode_CHECK(unicode)); | |
1051 if (Py_REFCNT(unicode) != 1) { | |
1052 PyErr_SetString(PyExc_SystemError, | |
1053 "Cannot modify a string having more than 1 reference"); | |
1054 return -1; | |
1055 } | |
1056 _PyUnicode_DIRTY(unicode); | |
1057 return 0; | |
1058 } | |
1059 | |
1060 static int | |
1061 _copy_characters(PyObject *to, Py_ssize_t to_start, | |
1062 PyObject *from, Py_ssize_t from_start, | |
1063 Py_ssize_t how_many, int check_maxchar) | |
1064 { | |
1065 unsigned int from_kind, to_kind; | |
1066 void *from_data, *to_data; | |
1067 int fast; | |
1068 | |
1069 assert(PyUnicode_Check(from)); | |
1070 assert(PyUnicode_Check(to)); | |
1071 assert(PyUnicode_IS_READY(from)); | |
1072 assert(PyUnicode_IS_READY(to)); | |
1073 | |
1074 assert(PyUnicode_GET_LENGTH(from) >= how_many); | |
1075 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); | |
1076 assert(0 <= how_many); | |
1077 | |
1078 if (how_many == 0) | |
1079 return 0; | |
1080 | |
1081 from_kind = PyUnicode_KIND(from); | |
1082 from_data = PyUnicode_DATA(from); | |
1083 to_kind = PyUnicode_KIND(to); | |
1084 to_data = PyUnicode_DATA(to); | |
1085 | |
1086 #ifdef Py_DEBUG | |
1087 if (!check_maxchar | |
1088 && (from_kind > to_kind | |
1089 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))) | |
1090 { | |
1091 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); | |
1092 Py_UCS4 ch; | |
1093 Py_ssize_t i; | |
1094 for (i=0; i < how_many; i++) { | |
1095 ch = PyUnicode_READ(from_kind, from_data, from_start + i); | |
1096 assert(ch <= to_maxchar); | |
1097 } | |
1098 } | |
1099 #endif | |
1100 fast = (from_kind == to_kind); | |
1101 if (check_maxchar | |
1102 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) | |
1103 { | |
1104 /* deny latin1 => ascii */ | |
1105 fast = 0; | |
1106 } | |
1107 | |
1108 if (fast) { | |
1109 Py_MEMCPY((char*)to_data + to_kind * to_start, | |
1110 (char*)from_data + from_kind * from_start, | |
1111 to_kind * how_many); | |
1112 } | |
1113 else if (from_kind == PyUnicode_1BYTE_KIND | |
1114 && to_kind == PyUnicode_2BYTE_KIND) | |
1115 { | |
1116 _PyUnicode_CONVERT_BYTES( | |
1117 Py_UCS1, Py_UCS2, | |
1118 PyUnicode_1BYTE_DATA(from) + from_start, | |
1119 PyUnicode_1BYTE_DATA(from) + from_start + how_many, | |
1120 PyUnicode_2BYTE_DATA(to) + to_start | |
1121 ); | |
1122 } | |
1123 else if (from_kind == PyUnicode_1BYTE_KIND | |
1124 && to_kind == PyUnicode_4BYTE_KIND) | |
1125 { | |
1126 _PyUnicode_CONVERT_BYTES( | |
1127 Py_UCS1, Py_UCS4, | |
1128 PyUnicode_1BYTE_DATA(from) + from_start, | |
1129 PyUnicode_1BYTE_DATA(from) + from_start + how_many, | |
1130 PyUnicode_4BYTE_DATA(to) + to_start | |
1131 ); | |
1132 } | |
1133 else if (from_kind == PyUnicode_2BYTE_KIND | |
1134 && to_kind == PyUnicode_4BYTE_KIND) | |
1135 { | |
1136 _PyUnicode_CONVERT_BYTES( | |
1137 Py_UCS2, Py_UCS4, | |
1138 PyUnicode_2BYTE_DATA(from) + from_start, | |
1139 PyUnicode_2BYTE_DATA(from) + from_start + how_many, | |
1140 PyUnicode_4BYTE_DATA(to) + to_start | |
1141 ); | |
1142 } | |
1143 else { | |
1144 /* check if max_char(from substring) <= max_char(to) */ | |
1145 if (from_kind > to_kind | |
1146 /* latin1 => ascii */ | |
1147 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) | |
1148 { | |
1149 /* slow path to check for character overflow */ | |
1150 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); | |
1151 Py_UCS4 ch; | |
1152 Py_ssize_t i; | |
1153 | |
1154 #ifdef Py_DEBUG | |
1155 for (i=0; i < how_many; i++) { | |
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i); | |
1157 assert(ch <= to_maxchar); | |
1158 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); | |
1159 } | |
1160 #else | |
1161 if (!check_maxchar) { | |
1162 for (i=0; i < how_many; i++) { | |
1163 ch = PyUnicode_READ(from_kind, from_data, from_start + i); | |
1164 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); | |
1165 } | |
1166 } | |
1167 else { | |
1168 for (i=0; i < how_many; i++) { | |
1169 ch = PyUnicode_READ(from_kind, from_data, from_start + i); | |
1170 if (ch > to_maxchar) | |
1171 return 1; | |
1172 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); | |
1173 } | |
1174 } | |
1175 #endif | |
1176 } | |
1177 else { | |
1178 assert(0 && "inconsistent state"); | |
1179 return 1; | |
1180 } | |
1181 } | |
1182 return 0; | |
1183 } | |
1184 | |
/* Internal copy without per-character checking: calls _copy_characters()
   with check_maxchar == 0 and discards its result. */
static void
copy_characters(PyObject *to, Py_ssize_t to_start,
                PyObject *from, Py_ssize_t from_start,
                Py_ssize_t how_many)
{
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
}
1192 | |
1193 Py_ssize_t | |
1194 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, | |
1195 PyObject *from, Py_ssize_t from_start, | |
1196 Py_ssize_t how_many) | |
1197 { | |
1198 int err; | |
1199 | |
1200 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { | |
1201 PyErr_BadInternalCall(); | |
1202 return -1; | |
1203 } | |
1204 | |
1205 if (PyUnicode_READY(from)) | |
1206 return -1; | |
1207 if (PyUnicode_READY(to)) | |
1208 return -1; | |
1209 | |
1210 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); | |
1211 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { | |
1212 PyErr_Format(PyExc_SystemError, | |
1213 "Cannot write %zi characters at %zi " | |
1214 "in a string of %zi characters", | |
1215 how_many, to_start, PyUnicode_GET_LENGTH(to)); | |
1216 return -1; | |
1217 } | |
1218 | |
1219 if (how_many == 0) | |
1220 return 0; | |
1221 | |
1222 if (_PyUnicode_Dirty(to)) | |
1223 return -1; | |
1224 | |
1225 err = _copy_characters(to, to_start, from, from_start, how_many, 1); | |
1226 if (err) { | |
1227 PyErr_Format(PyExc_SystemError, | |
1228 "Cannot copy %s characters " | |
1229 "into a string of %s characters", | |
1230 unicode_kind_name(from), | |
1231 unicode_kind_name(to)); | |
1232 return -1; | |
1233 } | |
1234 return how_many; | |
1235 } | |
1236 | |
1237 /* Find the maximum code point and count the number of surrogate pairs so a | |
1238 correct string length can be computed before converting a string to UCS4. | |
1239 This function counts single surrogates as a character and not as a pair. | |
1240 | |
1241 Return 0 on success, or -1 on error. */ | |
1242 static int | |
1243 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, | |
1244 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) | |
1245 { | |
1246 const wchar_t *iter; | |
1247 Py_UCS4 ch; | |
1248 | |
1249 assert(num_surrogates != NULL && maxchar != NULL); | |
1250 *num_surrogates = 0; | |
1251 *maxchar = 0; | |
1252 | |
1253 for (iter = begin; iter < end; ) { | |
1254 { | |
1255 ch = *iter; | |
1256 iter++; | |
1257 } | |
1258 if (ch > *maxchar) { | |
1259 *maxchar = ch; | |
1260 if (*maxchar > MAX_UNICODE) { | |
1261 PyErr_Format(PyExc_ValueError, | |
1262 "character U+%x is not in range [U+0000; U+10ffff]", | |
1263 ch); | |
1264 return -1; | |
1265 } | |
1266 } | |
1267 } | |
1268 return 0; | |
1269 } | |
1270 | |
/* Convert a not-yet-ready string (wstr representation only) into its
   canonical representation, choosing the narrowest kind that can hold the
   largest character found in the wstr buffer.

   For the 1-byte and 2-byte cases a new data buffer is allocated, the
   characters are converted into it and the wstr buffer is freed.  In the
   remaining case the wstr buffer itself becomes the data buffer.

   Returns 0 on success, -1 with an exception set on failure (invalid
   character reported by find_maxchar_surrogates(), or out of memory). */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1 byte per character, plus one byte for the terminator */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer doubles as the UTF-8 buffer. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wstr buffer is no longer needed once converted. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
        assert(num_surrogates == 0);

        /* The wstr buffer is reused as the data buffer; nothing is
           allocated or freed on this path. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1361 | |
1362 static void | |
1363 unicode_dealloc(register PyObject *unicode) | |
1364 { | |
1365 switch (PyUnicode_CHECK_INTERNED(unicode)) { | |
1366 case SSTATE_NOT_INTERNED: | |
1367 break; | |
1368 | |
1369 case SSTATE_INTERNED_MORTAL: | |
1370 /* revive dead object temporarily for DelItem */ | |
1371 Py_REFCNT(unicode) = 3; | |
1372 if (PyDict_DelItem(interned, unicode) != 0) | |
1373 Py_FatalError( | |
1374 "deletion of interned string failed"); | |
1375 break; | |
1376 | |
1377 case SSTATE_INTERNED_IMMORTAL: | |
1378 Py_FatalError("Immortal interned string died."); | |
1379 | |
1380 default: | |
1381 Py_FatalError("Inconsistent interned string state."); | |
1382 } | |
1383 | |
1384 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) | |
1385 PyObject_DEL(_PyUnicode_WSTR(unicode)); | |
1386 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) | |
1387 PyObject_DEL(_PyUnicode_UTF8(unicode)); | |
1388 | |
1389 if (PyUnicode_IS_COMPACT(unicode)) { | |
1390 Py_TYPE(unicode)->tp_free(unicode); | |
1391 } | |
1392 else { | |
1393 if (_PyUnicode_DATA_ANY(unicode)) | |
1394 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); | |
1395 Py_TYPE(unicode)->tp_free(unicode); | |
1396 } | |
1397 } | |
1398 | |
1399 #ifdef Py_DEBUG | |
1400 static int | |
1401 unicode_is_singleton(PyObject *unicode) | |
1402 { | |
1403 PyASCIIObject *ascii = (PyASCIIObject *)unicode; | |
1404 if (unicode == unicode_empty) | |
1405 return 1; | |
1406 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) | |
1407 { | |
1408 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); | |
1409 if (ch < 256 && unicode_latin1[ch] == unicode) | |
1410 return 1; | |
1411 } | |
1412 return 0; | |
1413 } | |
1414 #endif | |
1415 | |
1416 static int | |
1417 unicode_resizable(PyObject *unicode) | |
1418 { | |
1419 if (Py_REFCNT(unicode) != 1) | |
1420 return 0; | |
1421 if (PyUnicode_CHECK_INTERNED(unicode)) | |
1422 return 0; | |
1423 #ifdef Py_DEBUG | |
1424 /* singleton refcount is greater than 1 */ | |
1425 assert(!unicode_is_singleton(unicode)); | |
1426 #endif | |
1427 return 1; | |
1428 } | |
1429 | |
1430 static int | |
1431 unicode_resize(PyObject **p_unicode, Py_ssize_t length) | |
1432 { | |
1433 PyObject *unicode; | |
1434 Py_ssize_t old_length; | |
1435 | |
1436 assert(p_unicode != NULL); | |
1437 unicode = *p_unicode; | |
1438 | |
1439 assert(unicode != NULL); | |
1440 assert(PyUnicode_Check(unicode)); | |
1441 assert(0 <= length); | |
1442 | |
1443 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) | |
1444 old_length = PyUnicode_WSTR_LENGTH(unicode); | |
1445 else | |
1446 old_length = PyUnicode_GET_LENGTH(unicode); | |
1447 if (old_length == length) | |
1448 return 0; | |
1449 | |
1450 if (length == 0) { | |
1451 Py_DECREF(*p_unicode); | |
1452 *p_unicode = unicode_empty; | |
1453 Py_INCREF(*p_unicode); | |
1454 return 0; | |
1455 } | |
1456 | |
1457 if (!unicode_resizable(unicode)) { | |
1458 PyObject *copy = resize_copy(unicode, length); | |
1459 if (copy == NULL) | |
1460 return -1; | |
1461 Py_DECREF(*p_unicode); | |
1462 *p_unicode = copy; | |
1463 return 0; | |
1464 } | |
1465 | |
1466 if (PyUnicode_IS_COMPACT(unicode)) { | |
1467 *p_unicode = resize_compact(unicode, length); | |
1468 if (*p_unicode == NULL) | |
1469 return -1; | |
1470 assert(_PyUnicode_CheckConsistency(*p_unicode, 0)); | |
1471 return 0; | |
1472 } | |
1473 return resize_inplace(unicode, length); | |
1474 } | |
1475 | |
1476 int | |
1477 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) | |
1478 { | |
1479 PyObject *unicode; | |
1480 if (p_unicode == NULL) { | |
1481 PyErr_BadInternalCall(); | |
1482 return -1; | |
1483 } | |
1484 unicode = *p_unicode; | |
1485 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) | |
1486 { | |
1487 PyErr_BadInternalCall(); | |
1488 return -1; | |
1489 } | |
1490 return unicode_resize(p_unicode, length); | |
1491 } | |
1492 | |
1493 static int | |
1494 unicode_widen(PyObject **p_unicode, unsigned int maxchar) | |
1495 { | |
1496 PyObject *result; | |
1497 assert(PyUnicode_IS_READY(*p_unicode)); | |
1498 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode)) | |
1499 return 0; | |
1500 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode), | |
1501 maxchar); | |
1502 if (result == NULL) | |
1503 return -1; | |
1504 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0, | |
1505 PyUnicode_GET_LENGTH(*p_unicode)); | |
1506 Py_DECREF(*p_unicode); | |
1507 *p_unicode = result; | |
1508 return 0; | |
1509 } | |
1510 | |
1511 static int | |
1512 unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos, | |
1513 Py_UCS4 ch) | |
1514 { | |
1515 if (unicode_widen(p_unicode, ch) < 0) | |
1516 return -1; | |
1517 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode), | |
1518 PyUnicode_DATA(*p_unicode), | |
1519 (*pos)++, ch); | |
1520 return 0; | |
1521 } | |
1522 | |
1523 static PyObject* | |
1524 get_latin1_char(unsigned char ch) | |
1525 { | |
1526 PyObject *unicode = unicode_latin1[ch]; | |
1527 if (!unicode) { | |
1528 unicode = PyUnicode_New(1, ch); | |
1529 if (!unicode) | |
1530 return NULL; | |
1531 PyUnicode_1BYTE_DATA(unicode)[0] = ch; | |
1532 assert(_PyUnicode_CheckConsistency(unicode, 1)); | |
1533 unicode_latin1[ch] = unicode; | |
1534 } | |
1535 Py_INCREF(unicode); | |
1536 return unicode; | |
1537 } | |
1538 | |
/* Create a unicode object from a Py_UNICODE buffer of `size` units.

   With u == NULL the result of _PyUnicode_New(size) is returned.
   Otherwise the empty string and single characters below 256 are served
   from shared objects, and any other input is scanned for its maximum
   character and copied into a new string of the matching kind.

   Returns a new reference, or NULL on error. */
PyObject *
PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
{
    PyObject *unicode;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;

    if (u == NULL)
        return (PyObject*)_PyUnicode_New(size);

    /* If the Unicode data is known at construction time, we can apply
       some optimizations which share commonly used objects. */

    /* Optimization for empty strings */
    if (size == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Single character Unicode objects in the Latin-1 range are
       shared when using this constructor */
    if (size == 1 && *u < 256)
        return get_latin1_char((unsigned char)*u);

    /* If not empty and not single character, copy the Unicode data
       into the new object */
    if (find_maxchar_surrogates(u, u + size,
                                &maxchar, &num_surrogates) == -1)
        return NULL;

    unicode = PyUnicode_New(size - num_surrogates, maxchar);
    if (!unicode)
        return NULL;

    /* Copy or narrow the input according to the kind PyUnicode_New()
       selected for maxchar. */
    switch (PyUnicode_KIND(unicode)) {
    case PyUnicode_1BYTE_KIND:
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
        break;
    case PyUnicode_2BYTE_KIND:
#if Py_UNICODE_SIZE == 2
        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
#else
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
#endif
        break;
    case PyUnicode_4BYTE_KIND:
#if SIZEOF_WCHAR_T == 2
        /* This is the only case which has to process surrogates, thus
           a simple copy loop is not enough and we need a function. */
        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
#else
        assert(num_surrogates == 0);
        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
#endif
        break;
    default:
        assert(0 && "Impossible state");
    }

    return unicode_result(unicode);
}
1602 | |
1603 PyObject * | |
1604 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) | |
1605 { | |
1606 if (size < 0) { | |
1607 PyErr_SetString(PyExc_SystemError, | |
1608 "Negative size passed to PyUnicode_FromStringAndSize"); | |
1609 return NULL; | |
1610 } | |
1611 | |
1612 /* If the Unicode data is known at construction time, we can apply | |
1613 some optimizations which share commonly used objects. | |
1614 Also, this means the input must be UTF-8, so fall back to the | |
1615 UTF-8 decoder at the end. */ | |
1616 if (u != NULL) { | |
1617 | |
1618 /* Optimization for empty strings */ | |
1619 if (size == 0 && unicode_empty != NULL) { | |
1620 Py_INCREF(unicode_empty); | |
1621 return unicode_empty; | |
1622 } | |
1623 | |
1624 /* Single characters are shared when using this constructor. | |
1625 Restrict to ASCII, since the input must be UTF-8. */ | |
1626 if (size == 1 && (unsigned char)*u < 128) | |
1627 return get_latin1_char((unsigned char)*u); | |
1628 | |
1629 return PyUnicode_DecodeUTF8(u, size, NULL); | |
1630 } | |
1631 | |
1632 return (PyObject *)_PyUnicode_New(size); | |
1633 } | |
1634 | |
1635 PyObject * | |
1636 PyUnicode_FromString(const char *u) | |
1637 { | |
1638 size_t size = strlen(u); | |
1639 if (size > PY_SSIZE_T_MAX) { | |
1640 PyErr_SetString(PyExc_OverflowError, "input too long"); | |
1641 return NULL; | |
1642 } | |
1643 | |
1644 return PyUnicode_FromStringAndSize(u, size); | |
1645 } | |
1646 | |
1647 PyObject * | |
1648 _PyUnicode_FromId(_Py_Identifier *id) | |
1649 { | |
1650 if (!id->object) { | |
1651 id->object = PyUnicode_FromString(id->string); | |
1652 if (!id->object) | |
1653 return NULL; | |
1654 PyUnicode_InternInPlace(&id->object); | |
1655 assert(!id->next); | |
1656 id->next = static_strings; | |
1657 static_strings = id; | |
1658 } | |
1659 return id->object; | |
1660 } | |
1661 | |
1662 void | |
1663 _PyUnicode_ClearStaticStrings() | |
1664 { | |
1665 _Py_Identifier *i; | |
1666 for (i = static_strings; i; i = i->next) { | |
1667 Py_DECREF(i->object); | |
1668 i->object = NULL; | |
1669 i->next = NULL; | |
1670 } | |
1671 } | |
1672 | |
1673 /* Internal function, don't check maximum character */ | |
1674 | |
1675 static PyObject* | |
1676 unicode_fromascii(const unsigned char* s, Py_ssize_t size) | |
1677 { | |
1678 PyObject *res; | |
1679 #ifdef Py_DEBUG | |
1680 const unsigned char *p; | |
1681 const unsigned char *end = s + size; | |
1682 for (p=s; p < end; p++) { | |
1683 assert(*p < 128); | |
1684 } | |
1685 #endif | |
1686 if (size == 1) | |
1687 return get_latin1_char(s[0]); | |
1688 res = PyUnicode_New(size, 127); | |
1689 if (!res) | |
1690 return NULL; | |
1691 memcpy(PyUnicode_1BYTE_DATA(res), s, size); | |
1692 return res; | |
1693 } | |
1694 | |
1695 static Py_UCS4 | |
1696 kind_maxchar_limit(unsigned int kind) | |
1697 { | |
1698 switch(kind) { | |
1699 case PyUnicode_1BYTE_KIND: | |
1700 return 0x80; | |
1701 case PyUnicode_2BYTE_KIND: | |
1702 return 0x100; | |
1703 case PyUnicode_4BYTE_KIND: | |
1704 return 0x10000; | |
1705 default: | |
1706 assert(0 && "invalid kind"); | |
1707 return MAX_UNICODE; | |
1708 } | |
1709 } | |
1710 | |
1711 static PyObject* | |
1712 _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) | |
1713 { | |
1714 PyObject *res; | |
1715 unsigned char max_char; | |
1716 | |
1717 if (size == 0) { | |
1718 Py_INCREF(unicode_empty); | |
1719 return unicode_empty; | |
1720 } | |
1721 assert(size > 0); | |
1722 if (size == 1) | |
1723 return get_latin1_char(u[0]); | |
1724 | |
1725 max_char = ucs1lib_find_max_char(u, u + size); | |
1726 res = PyUnicode_New(size, max_char); | |
1727 if (!res) | |
1728 return NULL; | |
1729 memcpy(PyUnicode_1BYTE_DATA(res), u, size); | |
1730 assert(_PyUnicode_CheckConsistency(res, 1)); | |
1731 return res; | |
1732 } | |
1733 | |
1734 static PyObject* | |
1735 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) | |
1736 { | |
1737 PyObject *res; | |
1738 Py_UCS2 max_char; | |
1739 | |
1740 if (size == 0) { | |
1741 Py_INCREF(unicode_empty); | |
1742 return unicode_empty; | |
1743 } | |
1744 assert(size > 0); | |
1745 if (size == 1 && u[0] < 256) | |
1746 return get_latin1_char((unsigned char)u[0]); | |
1747 | |
1748 max_char = ucs2lib_find_max_char(u, u + size); | |
1749 res = PyUnicode_New(size, max_char); | |
1750 if (!res) | |
1751 return NULL; | |
1752 if (max_char >= 256) | |
1753 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); | |
1754 else { | |
1755 _PyUnicode_CONVERT_BYTES( | |
1756 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); | |
1757 } | |
1758 assert(_PyUnicode_CheckConsistency(res, 1)); | |
1759 return res; | |
1760 } | |
1761 | |
1762 static PyObject* | |
1763 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) | |
1764 { | |
1765 PyObject *res; | |
1766 Py_UCS4 max_char; | |
1767 | |
1768 if (size == 0) { | |
1769 Py_INCREF(unicode_empty); | |
1770 return unicode_empty; | |
1771 } | |
1772 assert(size > 0); | |
1773 if (size == 1 && u[0] < 256) | |
1774 return get_latin1_char((unsigned char)u[0]); | |
1775 | |
1776 max_char = ucs4lib_find_max_char(u, u + size); | |
1777 res = PyUnicode_New(size, max_char); | |
1778 if (!res) | |
1779 return NULL; | |
1780 if (max_char < 256) | |
1781 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, | |
1782 PyUnicode_1BYTE_DATA(res)); | |
1783 else if (max_char < 0x10000) | |
1784 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, | |
1785 PyUnicode_2BYTE_DATA(res)); | |
1786 else | |
1787 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); | |
1788 assert(_PyUnicode_CheckConsistency(res, 1)); | |
1789 return res; | |
1790 } | |
1791 | |
1792 PyObject* | |
1793 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) | |
1794 { | |
1795 if (size < 0) { | |
1796 PyErr_SetString(PyExc_ValueError, "size must be positive"); | |
1797 return NULL; | |
1798 } | |
1799 switch(kind) { | |
1800 case PyUnicode_1BYTE_KIND: | |
1801 return _PyUnicode_FromUCS1(buffer, size); | |
1802 case PyUnicode_2BYTE_KIND: | |
1803 return _PyUnicode_FromUCS2(buffer, size); | |
1804 case PyUnicode_4BYTE_KIND: | |
1805 return _PyUnicode_FromUCS4(buffer, size); | |
1806 default: | |
1807 PyErr_SetString(PyExc_SystemError, "invalid kind"); | |
1808 return NULL; | |
1809 } | |
1810 } | |
1811 | |
1812 /* Ensure that a string uses the most efficient storage, if it is not the | |
1813 case: create a new string with of the right kind. Write NULL into *p_unicode | |
1814 on error. */ | |
1815 static void | |
1816 unicode_adjust_maxchar(PyObject **p_unicode) | |
1817 { | |
1818 PyObject *unicode, *copy; | |
1819 Py_UCS4 max_char; | |
1820 Py_ssize_t len; | |
1821 unsigned int kind; | |
1822 | |
1823 assert(p_unicode != NULL); | |
1824 unicode = *p_unicode; | |
1825 assert(PyUnicode_IS_READY(unicode)); | |
1826 if (PyUnicode_IS_ASCII(unicode)) | |
1827 return; | |
1828 | |
1829 len = PyUnicode_GET_LENGTH(unicode); | |
1830 kind = PyUnicode_KIND(unicode); | |
1831 if (kind == PyUnicode_1BYTE_KIND) { | |
1832 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); | |
1833 max_char = ucs1lib_find_max_char(u, u + len); | |
1834 if (max_char >= 128) | |
1835 return; | |
1836 } | |
1837 else if (kind == PyUnicode_2BYTE_KIND) { | |
1838 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); | |
1839 max_char = ucs2lib_find_max_char(u, u + len); | |
1840 if (max_char >= 256) | |
1841 return; | |
1842 } | |
1843 else { | |
1844 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); | |
1845 assert(kind == PyUnicode_4BYTE_KIND); | |
1846 max_char = ucs4lib_find_max_char(u, u + len); | |
1847 if (max_char >= 0x10000) | |
1848 return; | |
1849 } | |
1850 copy = PyUnicode_New(len, max_char); | |
1851 copy_characters(copy, 0, unicode, 0, len); | |
1852 Py_DECREF(unicode); | |
1853 *p_unicode = copy; | |
1854 } | |
1855 | |
1856 PyObject* | |
1857 PyUnicode_Copy(PyObject *unicode) | |
1858 { | |
1859 Py_ssize_t length; | |
1860 PyObject *copy; | |
1861 | |
1862 if (!PyUnicode_Check(unicode)) { | |
1863 PyErr_BadInternalCall(); | |
1864 return NULL; | |
1865 } | |
1866 if (PyUnicode_READY(unicode)) | |
1867 return NULL; | |
1868 | |
1869 length = PyUnicode_GET_LENGTH(unicode); | |
1870 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); | |
1871 if (!copy) | |
1872 return NULL; | |
1873 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); | |
1874 | |
1875 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), | |
1876 length * PyUnicode_KIND(unicode)); | |
1877 assert(_PyUnicode_CheckConsistency(copy, 1)); | |
1878 return copy; | |
1879 } | |
1880 | |
1881 | |
1882 /* Widen Unicode objects to larger buffers. Don't write terminating null | |
1883 character. Return NULL on error. */ | |
1884 | |
1885 void* | |
1886 _PyUnicode_AsKind(PyObject *s, unsigned int kind) | |
1887 { | |
1888 Py_ssize_t len; | |
1889 void *result; | |
1890 unsigned int skind; | |
1891 | |
1892 if (PyUnicode_READY(s)) | |
1893 return NULL; | |
1894 | |
1895 len = PyUnicode_GET_LENGTH(s); | |
1896 skind = PyUnicode_KIND(s); | |
1897 if (skind >= kind) { | |
1898 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); | |
1899 return NULL; | |
1900 } | |
1901 switch(kind) { | |
1902 case PyUnicode_2BYTE_KIND: | |
1903 result = PyMem_Malloc(len * sizeof(Py_UCS2)); | |
1904 if (!result) | |
1905 return PyErr_NoMemory(); | |
1906 assert(skind == PyUnicode_1BYTE_KIND); | |
1907 _PyUnicode_CONVERT_BYTES( | |
1908 Py_UCS1, Py_UCS2, | |
1909 PyUnicode_1BYTE_DATA(s), | |
1910 PyUnicode_1BYTE_DATA(s) + len, | |
1911 result); | |
1912 return result; | |
1913 case PyUnicode_4BYTE_KIND: | |
1914 result = PyMem_Malloc(len * sizeof(Py_UCS4)); | |
1915 if (!result) | |
1916 return PyErr_NoMemory(); | |
1917 if (skind == PyUnicode_2BYTE_KIND) { | |
1918 _PyUnicode_CONVERT_BYTES( | |
1919 Py_UCS2, Py_UCS4, | |
1920 PyUnicode_2BYTE_DATA(s), | |
1921 PyUnicode_2BYTE_DATA(s) + len, | |
1922 result); | |
1923 } | |
1924 else { | |
1925 assert(skind == PyUnicode_1BYTE_KIND); | |
1926 _PyUnicode_CONVERT_BYTES( | |
1927 Py_UCS1, Py_UCS4, | |
1928 PyUnicode_1BYTE_DATA(s), | |
1929 PyUnicode_1BYTE_DATA(s) + len, | |
1930 result); | |
1931 } | |
1932 return result; | |
1933 default: | |
1934 break; | |
1935 } | |
1936 PyErr_SetString(PyExc_SystemError, "invalid kind"); | |
1937 return NULL; | |
1938 } | |
1939 | |
1940 static Py_UCS4* | |
1941 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, | |
1942 int copy_null) | |
1943 { | |
1944 int kind; | |
1945 void *data; | |
1946 Py_ssize_t len, targetlen; | |
1947 if (PyUnicode_READY(string) == -1) | |
1948 return NULL; | |
1949 kind = PyUnicode_KIND(string); | |
1950 data = PyUnicode_DATA(string); | |
1951 len = PyUnicode_GET_LENGTH(string); | |
1952 targetlen = len; | |
1953 if (copy_null) | |
1954 targetlen++; | |
1955 if (!target) { | |
1956 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { | |
1957 PyErr_NoMemory(); | |
1958 return NULL; | |
1959 } | |
1960 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); | |
1961 if (!target) { | |
1962 PyErr_NoMemory(); | |
1963 return NULL; | |
1964 } | |
1965 } | |
1966 else { | |
1967 if (targetsize < targetlen) { | |
1968 PyErr_Format(PyExc_SystemError, | |
1969 "string is longer than the buffer"); | |
1970 if (copy_null && 0 < targetsize) | |
1971 target[0] = 0; | |
1972 return NULL; | |
1973 } | |
1974 } | |
1975 if (kind == PyUnicode_1BYTE_KIND) { | |
1976 Py_UCS1 *start = (Py_UCS1 *) data; | |
1977 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); | |
1978 } | |
1979 else if (kind == PyUnicode_2BYTE_KIND) { | |
1980 Py_UCS2 *start = (Py_UCS2 *) data; | |
1981 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); | |
1982 } | |
1983 else { | |
1984 assert(kind == PyUnicode_4BYTE_KIND); | |
1985 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); | |
1986 } | |
1987 if (copy_null) | |
1988 target[len] = 0; | |
1989 return target; | |
1990 } | |
1991 | |
1992 Py_UCS4* | |
1993 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, | |
1994 int copy_null) | |
1995 { | |
1996 if (target == NULL || targetsize < 0) { | |
1997 PyErr_BadInternalCall(); | |
1998 return NULL; | |
1999 } | |
2000 return as_ucs4(string, target, targetsize, copy_null); | |
2001 } | |
2002 | |
2003 Py_UCS4* | |
2004 PyUnicode_AsUCS4Copy(PyObject *string) | |
2005 { | |
2006 return as_ucs4(string, NULL, 0, 1); | |
2007 } | |
2008 | |
2009 #ifdef HAVE_WCHAR_H | |
2010 | |
2011 PyObject * | |
2012 PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) | |
2013 { | |
2014 if (w == NULL) { | |
2015 if (size == 0) | |
2016 return PyUnicode_New(0, 0); | |
2017 PyErr_BadInternalCall(); | |
2018 return NULL; | |
2019 } | |
2020 | |
2021 if (size == -1) { | |
2022 size = wcslen(w); | |
2023 } | |
2024 | |
2025 return PyUnicode_FromUnicode(w, size); | |
2026 } | |
2027 | |
2028 #endif /* HAVE_WCHAR_H */ | |
2029 | |
2030 static void | |
2031 makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, | |
2032 int zeropad, int width, int precision, char c) | |
2033 { | |
2034 *fmt++ = '%'; | |
2035 if (width) { | |
2036 if (zeropad) | |
2037 *fmt++ = '0'; | |
2038 fmt += sprintf(fmt, "%d", width); | |
2039 } | |
2040 if (precision) | |
2041 fmt += sprintf(fmt, ".%d", precision); | |
2042 if (longflag) | |
2043 *fmt++ = 'l'; | |
2044 else if (longlongflag) { | |
2045 /* longlongflag should only ever be nonzero on machines with | |
2046 HAVE_LONG_LONG defined */ | |
2047 #ifdef HAVE_LONG_LONG | |
2048 char *f = PY_FORMAT_LONG_LONG; | |
2049 while (*f) | |
2050 *fmt++ = *f++; | |
2051 #else | |
2052 /* we shouldn't ever get here */ | |
2053 assert(0); | |
2054 *fmt++ = 'l'; | |
2055 #endif | |
2056 } | |
2057 else if (size_tflag) { | |
2058 char *f = PY_FORMAT_SIZE_T; | |
2059 while (*f) | |
2060 *fmt++ = *f++; | |
2061 } | |
2062 *fmt++ = c; | |
2063 *fmt = '\0'; | |
2064 } | |
2065 | |
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that sits between '%' and the conversion character.

   `f` must point at the '%'.  The width, the precision and the "l", "ll"
   and "z" length modifiers are decoded; each result is stored through the
   matching output pointer unless that pointer is NULL.  A length modifier
   is only recognised when it is directly followed by 'd', 'u' or 'i'.

   Return a pointer to the character at which parsing stopped, normally the
   conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' */
    f++;

    /* width.precision, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2130 | |
2131 /* maximum number of characters required for output of %ld. 21 characters | |
2132 allows for 64-bit integers (in decimal) and an optional sign. */ | |
2133 #define MAX_LONG_CHARS 21 | |
2134 /* maximum number of characters required for output of %lld. | |
2135 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, | |
2136 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ | |
2137 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) | |
2138 | |
2139 PyObject * | |
2140 PyUnicode_FromFormatV(const char *format, va_list vargs) | |
2141 { | |
2142 va_list count; | |
2143 Py_ssize_t callcount = 0; | |
2144 PyObject **callresults = NULL; | |
2145 PyObject **callresult = NULL; | |
2146 Py_ssize_t n = 0; | |
2147 int width = 0; | |
2148 int precision = 0; | |
2149 int zeropad; | |
2150 const char* f; | |
2151 PyObject *string; | |
2152 /* used by sprintf */ | |
2153 char fmt[61]; /* should be enough for %0width.precisionlld */ | |
2154 Py_UCS4 maxchar = 127; /* result is ASCII by default */ | |
2155 Py_UCS4 argmaxchar; | |
2156 Py_ssize_t numbersize = 0; | |
2157 char *numberresults = NULL; | |
2158 char *numberresult = NULL; | |
2159 Py_ssize_t i; | |
2160 int kind; | |
2161 void *data; | |
2162 | |
2163 Py_VA_COPY(count, vargs); | |
2164 /* step 1: count the number of %S/%R/%A/%s format specifications | |
2165 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ | |
2166 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the | |
2167 * result in an array) | |
2168 * also estimate a upper bound for all the number formats in the string, | |
2169 * numbers will be formatted in step 3 and be kept in a '\0'-separated | |
2170 * buffer before putting everything together. */ | |
2171 for (f = format; *f; f++) { | |
2172 if (*f == '%') { | |
2173 int longlongflag; | |
2174 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ | |
2175 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); | |
2176 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') | |
2177 ++callcount; | |
2178 | |
2179 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { | |
2180 #ifdef HAVE_LONG_LONG | |
2181 if (longlongflag) { | |
2182 if (width < MAX_LONG_LONG_CHARS) | |
2183 width = MAX_LONG_LONG_CHARS; | |
2184 } | |
2185 else | |
2186 #endif | |
2187 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, | |
2188 including sign. Decimal takes the most space. This | |
2189 isn't enough for octal. If a width is specified we | |
2190 need more (which we allocate later). */ | |
2191 if (width < MAX_LONG_CHARS) | |
2192 width = MAX_LONG_CHARS; | |
2193 | |
2194 /* account for the size + '\0' to separate numbers | |
2195 inside of the numberresults buffer */ | |
2196 numbersize += (width + 1); | |
2197 } | |
2198 } | |
2199 else if ((unsigned char)*f > 127) { | |
2200 PyErr_Format(PyExc_ValueError, | |
2201 "PyUnicode_FromFormatV() expects an ASCII-encoded format " | |
2202 "string, got a non-ASCII byte: 0x%02x", | |
2203 (unsigned char)*f); | |
2204 return NULL; | |
2205 } | |
2206 } | |
2207 /* step 2: allocate memory for the results of | |
2208 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ | |
2209 if (callcount) { | |
2210 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); | |
2211 if (!callresults) { | |
2212 PyErr_NoMemory(); | |
2213 return NULL; | |
2214 } | |
2215 callresult = callresults; | |
2216 } | |
2217 /* step 2.5: allocate memory for the results of formating numbers */ | |
2218 if (numbersize) { | |
2219 numberresults = PyObject_Malloc(numbersize); | |
2220 if (!numberresults) { | |
2221 PyErr_NoMemory(); | |
2222 goto fail; | |
2223 } | |
2224 numberresult = numberresults; | |
2225 } | |
2226 | |
2227 /* step 3: format numbers and figure out how large a buffer we need */ | |
2228 for (f = format; *f; f++) { | |
2229 if (*f == '%') { | |
2230 const char* p; | |
2231 int longflag; | |
2232 int longlongflag; | |
2233 int size_tflag; | |
2234 int numprinted; | |
2235 | |
2236 p = f; | |
2237 zeropad = (f[1] == '0'); | |
2238 f = parse_format_flags(f, &width, &precision, | |
2239 &longflag, &longlongflag, &size_tflag); | |
2240 switch (*f) { | |
2241 case 'c': | |
2242 { | |
2243 Py_UCS4 ordinal = va_arg(count, int); | |
2244 maxchar = Py_MAX(maxchar, ordinal); | |
2245 n++; | |
2246 break; | |
2247 } | |
2248 case '%': | |
2249 n++; | |
2250 break; | |
2251 case 'i': | |
2252 case 'd': | |
2253 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, | |
2254 width, precision, *f); | |
2255 if (longflag) | |
2256 numprinted = sprintf(numberresult, fmt, | |
2257 va_arg(count, long)); | |
2258 #ifdef HAVE_LONG_LONG | |
2259 else if (longlongflag) | |
2260 numprinted = sprintf(numberresult, fmt, | |
2261 va_arg(count, PY_LONG_LONG)); | |
2262 #endif | |
2263 else if (size_tflag) | |
2264 numprinted = sprintf(numberresult, fmt, | |
2265 va_arg(count, Py_ssize_t)); | |
2266 else | |
2267 numprinted = sprintf(numberresult, fmt, | |
2268 va_arg(count, int)); | |
2269 n += numprinted; | |
2270 /* advance by +1 to skip over the '\0' */ | |
2271 numberresult += (numprinted + 1); | |
2272 assert(*(numberresult - 1) == '\0'); | |
2273 assert(*(numberresult - 2) != '\0'); | |
2274 assert(numprinted >= 0); | |
2275 assert(numberresult <= numberresults + numbersize); | |
2276 break; | |
2277 case 'u': | |
2278 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, | |
2279 width, precision, 'u'); | |
2280 if (longflag) | |
2281 numprinted = sprintf(numberresult, fmt, | |
2282 va_arg(count, unsigned long)); | |
2283 #ifdef HAVE_LONG_LONG | |
2284 else if (longlongflag) | |
2285 numprinted = sprintf(numberresult, fmt, | |
2286 va_arg(count, unsigned PY_LONG_LONG)); | |
2287 #endif | |
2288 else if (size_tflag) | |
2289 numprinted = sprintf(numberresult, fmt, | |
2290 va_arg(count, size_t)); | |
2291 else | |
2292 numprinted = sprintf(numberresult, fmt, | |
2293 va_arg(count, unsigned int)); | |
2294 n += numprinted; | |
2295 numberresult += (numprinted + 1); | |
2296 assert(*(numberresult - 1) == '\0'); | |
2297 assert(*(numberresult - 2) != '\0'); | |
2298 assert(numprinted >= 0); | |
2299 assert(numberresult <= numberresults + numbersize); | |
2300 break; | |
2301 case 'x': | |
2302 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); | |
2303 numprinted = sprintf(numberresult, fmt, va_arg(count, int)); | |
2304 n += numprinted; | |
2305 numberresult += (numprinted + 1); | |
2306 assert(*(numberresult - 1) == '\0'); | |
2307 assert(*(numberresult - 2) != '\0'); | |
2308 assert(numprinted >= 0); | |
2309 assert(numberresult <= numberresults + numbersize); | |
2310 break; | |
2311 case 'p': | |
2312 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); | |
2313 /* %p is ill-defined: ensure leading 0x. */ | |
2314 if (numberresult[1] == 'X') | |
2315 numberresult[1] = 'x'; | |
2316 else if (numberresult[1] != 'x') { | |
2317 memmove(numberresult + 2, numberresult, | |
2318 strlen(numberresult) + 1); | |
2319 numberresult[0] = '0'; | |
2320 numberresult[1] = 'x'; | |
2321 numprinted += 2; | |
2322 } | |
2323 n += numprinted; | |
2324 numberresult += (numprinted + 1); | |
2325 assert(*(numberresult - 1) == '\0'); | |
2326 assert(*(numberresult - 2) != '\0'); | |
2327 assert(numprinted >= 0); | |
2328 assert(numberresult <= numberresults + numbersize); | |
2329 break; | |
2330 case 's': | |
2331 { | |
2332 /* UTF-8 */ | |
2333 const char *s = va_arg(count, const char*); | |
2334 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); | |
2335 if (!str) | |
2336 goto fail; | |
2337 /* since PyUnicode_DecodeUTF8 returns already flexible | |
2338 unicode objects, there is no need to call ready on them */ | |
2339 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); | |
2340 maxchar = Py_MAX(maxchar, argmaxchar); | |
2341 n += PyUnicode_GET_LENGTH(str); | |
2342 /* Remember the str and switch to the next slot */ | |
2343 *callresult++ = str; | |
2344 break; | |
2345 } | |
2346 case 'U': | |
2347 { | |
2348 PyObject *obj = va_arg(count, PyObject *); | |
2349 assert(obj && _PyUnicode_CHECK(obj)); | |
2350 if (PyUnicode_READY(obj) == -1) | |
2351 goto fail; | |
2352 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); | |
2353 maxchar = Py_MAX(maxchar, argmaxchar); | |
2354 n += PyUnicode_GET_LENGTH(obj); | |
2355 break; | |
2356 } | |
2357 case 'V': | |
2358 { | |
2359 PyObject *obj = va_arg(count, PyObject *); | |
2360 const char *str = va_arg(count, const char *); | |
2361 PyObject *str_obj; | |
2362 assert(obj || str); | |
2363 assert(!obj || _PyUnicode_CHECK(obj)); | |
2364 if (obj) { | |
2365 if (PyUnicode_READY(obj) == -1) | |
2366 goto fail; | |
2367 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); | |
2368 maxchar = Py_MAX(maxchar, argmaxchar); | |
2369 n += PyUnicode_GET_LENGTH(obj); | |
2370 *callresult++ = NULL; | |
2371 } | |
2372 else { | |
2373 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); | |
2374 if (!str_obj) | |
2375 goto fail; | |
2376 if (PyUnicode_READY(str_obj)) { | |
2377 Py_DECREF(str_obj); | |
2378 goto fail; | |
2379 } | |
2380 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); | |
2381 maxchar = Py_MAX(maxchar, argmaxchar); | |
2382 n += PyUnicode_GET_LENGTH(str_obj); | |
2383 *callresult++ = str_obj; | |
2384 } | |
2385 break; | |
2386 } | |
2387 case 'S': | |
2388 { | |
2389 PyObject *obj = va_arg(count, PyObject *); | |
2390 PyObject *str; | |
2391 assert(obj); | |
2392 str = PyObject_Str(obj); | |
2393 if (!str || PyUnicode_READY(str) == -1) | |
2394 goto fail; | |
2395 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); | |
2396 maxchar = Py_MAX(maxchar, argmaxchar); | |
2397 n += PyUnicode_GET_LENGTH(str); | |
2398 /* Remember the str and switch to the next slot */ | |
2399 *callresult++ = str; | |
2400 break; | |
2401 } | |
2402 case 'R': | |
2403 { | |
2404 PyObject *obj = va_arg(count, PyObject *); | |
2405 PyObject *repr; | |
2406 assert(obj); | |
2407 repr = PyObject_Repr(obj); | |
2408 if (!repr || PyUnicode_READY(repr) == -1) | |
2409 goto fail; | |
2410 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); | |
2411 maxchar = Py_MAX(maxchar, argmaxchar); | |
2412 n += PyUnicode_GET_LENGTH(repr); | |
2413 /* Remember the repr and switch to the next slot */ | |
2414 *callresult++ = repr; | |
2415 break; | |
2416 } | |
2417 case 'A': | |
2418 { | |
2419 PyObject *obj = va_arg(count, PyObject *); | |
2420 PyObject *ascii; | |
2421 assert(obj); | |
2422 ascii = PyObject_ASCII(obj); | |
2423 if (!ascii || PyUnicode_READY(ascii) == -1) | |
2424 goto fail; | |
2425 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); | |
2426 maxchar = Py_MAX(maxchar, argmaxchar); | |
2427 n += PyUnicode_GET_LENGTH(ascii); | |
2428 /* Remember the repr and switch to the next slot */ | |
2429 *callresult++ = ascii; | |
2430 break; | |
2431 } | |
2432 default: | |
2433 /* if we stumble upon an unknown | |
2434 formatting code, copy the rest of | |
2435 the format string to the output | |
2436 string. (we cannot just skip the | |
2437 code, since there's no way to know | |
2438 what's in the argument list) */ | |
2439 n += strlen(p); | |
2440 goto expand; | |
2441 } | |
2442 } else | |
2443 n++; | |
2444 } | |
2445 expand: | |
2446 /* step 4: fill the buffer */ | |
2447 /* Since we've analyzed how much space we need, | |
2448 we don't have to resize the string. | |
2449 There can be no errors beyond this point. */ | |
2450 string = PyUnicode_New(n, maxchar); | |
2451 if (!string) | |
2452 goto fail; | |
2453 kind = PyUnicode_KIND(string); | |
2454 data = PyUnicode_DATA(string); | |
2455 callresult = callresults; | |
2456 numberresult = numberresults; | |
2457 | |
2458 for (i = 0, f = format; *f; f++) { | |
2459 if (*f == '%') { | |
2460 const char* p; | |
2461 | |
2462 p = f; | |
2463 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); | |
2464 /* checking for == because the last argument could be a empty | |
2465 string, which causes i to point to end, the assert at the end of | |
2466 the loop */ | |
2467 assert(i <= PyUnicode_GET_LENGTH(string)); | |
2468 | |
2469 switch (*f) { | |
2470 case 'c': | |
2471 { | |
2472 const int ordinal = va_arg(vargs, int); | |
2473 PyUnicode_WRITE(kind, data, i++, ordinal); | |
2474 break; | |
2475 } | |
2476 case 'i': | |
2477 case 'd': | |
2478 case 'u': | |
2479 case 'x': | |
2480 case 'p': | |
2481 /* unused, since we already have the result */ | |
2482 if (*f == 'p') | |
2483 (void) va_arg(vargs, void *); | |
2484 else | |
2485 (void) va_arg(vargs, int); | |
2486 /* extract the result from numberresults and append. */ | |
2487 for (; *numberresult; ++i, ++numberresult) | |
2488 PyUnicode_WRITE(kind, data, i, *numberresult); | |
2489 /* skip over the separating '\0' */ | |
2490 assert(*numberresult == '\0'); | |
2491 numberresult++; | |
2492 assert(numberresult <= numberresults + numbersize); | |
2493 break; | |
2494 case 's': | |
2495 { | |
2496 /* unused, since we already have the result */ | |
2497 Py_ssize_t size; | |
2498 (void) va_arg(vargs, char *); | |
2499 size = PyUnicode_GET_LENGTH(*callresult); | |
2500 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); | |
2501 copy_characters(string, i, *callresult, 0, size); | |
2502 i += size; | |
2503 /* We're done with the unicode()/repr() => forget it */ | |
2504 Py_DECREF(*callresult); | |
2505 /* switch to next unicode()/repr() result */ | |
2506 ++callresult; | |
2507 break; | |
2508 } | |
2509 case 'U': | |
2510 { | |
2511 PyObject *obj = va_arg(vargs, PyObject *); | |
2512 Py_ssize_t size; | |
2513 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); | |
2514 size = PyUnicode_GET_LENGTH(obj); | |
2515 copy_characters(string, i, obj, 0, size); | |
2516 i += size; | |
2517 break; | |
2518 } | |
2519 case 'V': | |
2520 { | |
2521 Py_ssize_t size; | |
2522 PyObject *obj = va_arg(vargs, PyObject *); | |
2523 va_arg(vargs, const char *); | |
2524 if (obj) { | |
2525 size = PyUnicode_GET_LENGTH(obj); | |
2526 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); | |
2527 copy_characters(string, i, obj, 0, size); | |
2528 i += size; | |
2529 } else { | |
2530 size = PyUnicode_GET_LENGTH(*callresult); | |
2531 assert(PyUnicode_KIND(*callresult) <= | |
2532 PyUnicode_KIND(string)); | |
2533 copy_characters(string, i, *callresult, 0, size); | |
2534 i += size; | |
2535 Py_DECREF(*callresult); | |
2536 } | |
2537 ++callresult; | |
2538 break; | |
2539 } | |
2540 case 'S': | |
2541 case 'R': | |
2542 case 'A': | |
2543 { | |
2544 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); | |
2545 /* unused, since we already have the result */ | |
2546 (void) va_arg(vargs, PyObject *); | |
2547 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); | |
2548 copy_characters(string, i, *callresult, 0, size); | |
2549 i += size; | |
2550 /* We're done with the unicode()/repr() => forget it */ | |
2551 Py_DECREF(*callresult); | |
2552 /* switch to next unicode()/repr() result */ | |
2553 ++callresult; | |
2554 break; | |
2555 } | |
2556 case '%': | |
2557 PyUnicode_WRITE(kind, data, i++, '%'); | |
2558 break; | |
2559 default: | |
2560 for (; *p; ++p, ++i) | |
2561 PyUnicode_WRITE(kind, data, i, *p); | |
2562 assert(i == PyUnicode_GET_LENGTH(string)); | |
2563 goto end; | |
2564 } | |
2565 } | |
2566 else { | |
2567 assert(i < PyUnicode_GET_LENGTH(string)); | |
2568 PyUnicode_WRITE(kind, data, i++, *f); | |
2569 } | |
2570 } | |
2571 assert(i == PyUnicode_GET_LENGTH(string)); | |
2572 | |
2573 end: | |
2574 if (callresults) | |
2575 PyObject_Free(callresults); | |
2576 if (numberresults) | |
2577 PyObject_Free(numberresults); | |
2578 return unicode_result(string); | |
2579 fail: | |
2580 if (callresults) { | |
2581 PyObject **callresult2 = callresults; | |
2582 while (callresult2 < callresult) { | |
2583 Py_XDECREF(*callresult2); | |
2584 ++callresult2; | |
2585 } | |
2586 PyObject_Free(callresults); | |
2587 } | |
2588 if (numberresults) | |
2589 PyObject_Free(numberresults); | |
2590 return NULL; | |
2591 } | |
2592 | |
2593 PyObject * | |
2594 PyUnicode_FromFormat(const char *format, ...) | |
2595 { | |
2596 PyObject* ret; | |
2597 va_list vargs; | |
2598 | |
2599 #ifdef HAVE_STDARG_PROTOTYPES | |
2600 va_start(vargs, format); | |
2601 #else | |
2602 va_start(vargs); | |
2603 #endif | |
2604 ret = PyUnicode_FromFormatV(format, vargs); | |
2605 va_end(vargs); | |
2606 return ret; | |
2607 } | |
2608 | |
2609 #ifdef HAVE_WCHAR_H | |
2610 | |
2611 /* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): | |
2612 convert a Unicode object to a wide character string. | |
2613 | |
2614 - If w is NULL: return the number of wide characters (including the null | |
2615 character) required to convert the unicode object. Ignore size argument. | |
2616 | |
2617 - Otherwise: return the number of wide characters (excluding the null | |
2618 character) written into w. Write at most size wide characters (including | |
2619 the null character). */ | |
2620 static Py_ssize_t | |
2621 unicode_aswidechar(PyObject *unicode, | |
2622 wchar_t *w, | |
2623 Py_ssize_t size) | |
2624 { | |
2625 Py_ssize_t res; | |
2626 const wchar_t *wstr; | |
2627 | |
2628 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); | |
2629 if (wstr == NULL) | |
2630 return -1; | |
2631 | |
2632 if (w != NULL) { | |
2633 if (size > res) | |
2634 size = res + 1; | |
2635 else | |
2636 res = size; | |
2637 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); | |
2638 return res; | |
2639 } | |
2640 else | |
2641 return res + 1; | |
2642 } | |
2643 | |
2644 Py_ssize_t | |
2645 PyUnicode_AsWideChar(PyObject *unicode, | |
2646 wchar_t *w, | |
2647 Py_ssize_t size) | |
2648 { | |
2649 if (unicode == NULL) { | |
2650 PyErr_BadInternalCall(); | |
2651 return -1; | |
2652 } | |
2653 return unicode_aswidechar(unicode, w, size); | |
2654 } | |
2655 | |
2656 wchar_t* | |
2657 PyUnicode_AsWideCharString(PyObject *unicode, | |
2658 Py_ssize_t *size) | |
2659 { | |
2660 wchar_t* buffer; | |
2661 Py_ssize_t buflen; | |
2662 | |
2663 if (unicode == NULL) { | |
2664 PyErr_BadInternalCall(); | |
2665 return NULL; | |
2666 } | |
2667 | |
2668 buflen = unicode_aswidechar(unicode, NULL, 0); | |
2669 if (buflen == -1) | |
2670 return NULL; | |
2671 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { | |
2672 PyErr_NoMemory(); | |
2673 return NULL; | |
2674 } | |
2675 | |
2676 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); | |
2677 if (buffer == NULL) { | |
2678 PyErr_NoMemory(); | |
2679 return NULL; | |
2680 } | |
2681 buflen = unicode_aswidechar(unicode, buffer, buflen); | |
2682 if (buflen == -1) | |
2683 return NULL; | |
2684 if (size != NULL) | |
2685 *size = buflen; | |
2686 return buffer; | |
2687 } | |
2688 | |
2689 #endif /* HAVE_WCHAR_H */ | |
2690 | |
2691 PyObject * | |
2692 PyUnicode_FromOrdinal(int ordinal) | |
2693 { | |
2694 PyObject *v; | |
2695 if (ordinal < 0 || ordinal > MAX_UNICODE) { | |
2696 PyErr_SetString(PyExc_ValueError, | |
2697 "chr() arg not in range(0x110000)"); | |
2698 return NULL; | |
2699 } | |
2700 | |
2701 if (ordinal < 256) | |
2702 return get_latin1_char(ordinal); | |
2703 | |
2704 v = PyUnicode_New(1, ordinal); | |
2705 if (v == NULL) | |
2706 return NULL; | |
2707 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); | |
2708 assert(_PyUnicode_CheckConsistency(v, 1)); | |
2709 return v; | |
2710 } | |
2711 | |
2712 PyObject * | |
2713 PyUnicode_FromObject(register PyObject *obj) | |
2714 { | |
2715 /* XXX Perhaps we should make this API an alias of | |
2716 PyObject_Str() instead ?! */ | |
2717 if (PyUnicode_CheckExact(obj)) { | |
2718 if (PyUnicode_READY(obj)) | |
2719 return NULL; | |
2720 Py_INCREF(obj); | |
2721 return obj; | |
2722 } | |
2723 if (PyUnicode_Check(obj)) { | |
2724 /* For a Unicode subtype that's not a Unicode object, | |
2725 return a true Unicode object with the same data. */ | |
2726 return PyUnicode_Copy(obj); | |
2727 } | |
2728 PyErr_Format(PyExc_TypeError, | |
2729 "Can't convert '%.100s' object to str implicitly", | |
2730 Py_TYPE(obj)->tp_name); | |
2731 return NULL; | |
2732 } | |
2733 | |
2734 PyObject * | |
2735 PyUnicode_FromEncodedObject(register PyObject *obj, | |
2736 const char *encoding, | |
2737 const char *errors) | |
2738 { | |
2739 Py_buffer buffer; | |
2740 PyObject *v; | |
2741 | |
2742 if (obj == NULL) { | |
2743 PyErr_BadInternalCall(); | |
2744 return NULL; | |
2745 } | |
2746 | |
2747 /* Decoding bytes objects is the most common case and should be fast */ | |
2748 if (PyBytes_Check(obj)) { | |
2749 if (PyBytes_GET_SIZE(obj) == 0) { | |
2750 Py_INCREF(unicode_empty); | |
2751 v = unicode_empty; | |
2752 } | |
2753 else { | |
2754 v = PyUnicode_Decode( | |
2755 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), | |
2756 encoding, errors); | |
2757 } | |
2758 return v; | |
2759 } | |
2760 | |
2761 if (PyUnicode_Check(obj)) { | |
2762 PyErr_SetString(PyExc_TypeError, | |
2763 "decoding str is not supported"); | |
2764 return NULL; | |
2765 } | |
2766 | |
2767 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ | |
2768 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { | |
2769 PyErr_Format(PyExc_TypeError, | |
2770 "coercing to str: need bytes, bytearray " | |
2771 "or buffer-like object, %.80s found", | |
2772 Py_TYPE(obj)->tp_name); | |
2773 return NULL; | |
2774 } | |
2775 | |
2776 if (buffer.len == 0) { | |
2777 Py_INCREF(unicode_empty); | |
2778 v = unicode_empty; | |
2779 } | |
2780 else | |
2781 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); | |
2782 | |
2783 PyBuffer_Release(&buffer); | |
2784 return v; | |
2785 } | |
2786 | |
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   are converted to lower case and '_' becomes '-', so that e.g. "UTF_8" is
   recognised as "utf-8".  A NULL encoding is normalized to "utf-8".

   `lower` is a buffer of `lower_len` bytes.  Return 1 on success, 0 if the
   encoding name is longer than lower_len-1 characters. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last writable position, reserved for the terminating null byte */
    char *dst_last = &lower[lower_len - 1];

    if (encoding == NULL) {
        strcpy(lower, "utf-8");
        return 1;
    }

    for (src = encoding; *src; src++) {
        if (dst == dst_last)
            return 0;
        if (Py_ISUPPER(*src))
            *dst++ = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst++ = '-';
        else
            *dst++ = *src;
    }
    *dst = '\0';
    return 1;
}
2823 | |
2824 PyObject * | |
2825 PyUnicode_Decode(const char *s, | |
2826 Py_ssize_t size, | |
2827 const char *encoding, | |
2828 const char *errors) | |
2829 { | |
2830 PyObject *buffer = NULL, *unicode; | |
2831 Py_buffer info; | |
2832 char lower[11]; /* Enough for any encoding shortcut */ | |
2833 | |
2834 /* Shortcuts for common default encodings */ | |
2835 if (normalize_encoding(encoding, lower, sizeof(lower))) { | |
2836 if ((strcmp(lower, "utf-8") == 0) || | |
2837 (strcmp(lower, "utf8") == 0)) | |
2838 return PyUnicode_DecodeUTF8(s, size, errors); | |
2839 else if ((strcmp(lower, "latin-1") == 0) || | |
2840 (strcmp(lower, "latin1") == 0) || | |
2841 (strcmp(lower, "iso-8859-1") == 0)) | |
2842 return PyUnicode_DecodeLatin1(s, size, errors); | |
2843 #ifdef HAVE_MBCS | |
2844 else if (strcmp(lower, "mbcs") == 0) | |
2845 return PyUnicode_DecodeMBCS(s, size, errors); | |
2846 #endif | |
2847 else if (strcmp(lower, "ascii") == 0) | |
2848 return PyUnicode_DecodeASCII(s, size, errors); | |
2849 else if (strcmp(lower, "utf-16") == 0) | |
2850 return PyUnicode_DecodeUTF16(s, size, errors, 0); | |
2851 else if (strcmp(lower, "utf-32") == 0) | |
2852 return PyUnicode_DecodeUTF32(s, size, errors, 0); | |
2853 } | |
2854 | |
2855 /* Decode via the codec registry */ | |
2856 buffer = NULL; | |
2857 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) | |
2858 goto onError; | |
2859 buffer = PyMemoryView_FromBuffer(&info); | |
2860 if (buffer == NULL) | |
2861 goto onError; | |
2862 unicode = PyCodec_Decode(buffer, encoding, errors); | |
2863 if (unicode == NULL) | |
2864 goto onError; | |
2865 if (!PyUnicode_Check(unicode)) { | |
2866 PyErr_Format(PyExc_TypeError, | |
2867 "decoder did not return a str object (type=%.400s)", | |
2868 Py_TYPE(unicode)->tp_name); | |
2869 Py_DECREF(unicode); | |
2870 goto onError; | |
2871 } | |
2872 Py_DECREF(buffer); | |
2873 return unicode_result(unicode); | |
2874 | |
2875 onError: | |
2876 Py_XDECREF(buffer); | |
2877 return NULL; | |
2878 } | |
2879 | |
2880 PyObject * | |
2881 PyUnicode_AsDecodedObject(PyObject *unicode, | |
2882 const char *encoding, | |
2883 const char *errors) | |
2884 { | |
2885 PyObject *v; | |
2886 | |
2887 if (!PyUnicode_Check(unicode)) { | |
2888 PyErr_BadArgument(); | |
2889 goto onError; | |
2890 } | |
2891 | |
2892 if (encoding == NULL) | |
2893 encoding = PyUnicode_GetDefaultEncoding(); | |
2894 | |
2895 /* Decode via the codec registry */ | |
2896 v = PyCodec_Decode(unicode, encoding, errors); | |
2897 if (v == NULL) | |
2898 goto onError; | |
2899 return unicode_result(v); | |
2900 | |
2901 onError: | |
2902 return NULL; | |
2903 } | |
2904 | |
2905 PyObject * | |
2906 PyUnicode_AsDecodedUnicode(PyObject *unicode, | |
2907 const char *encoding, | |
2908 const char *errors) | |
2909 { | |
2910 PyObject *v; | |
2911 | |
2912 if (!PyUnicode_Check(unicode)) { | |
2913 PyErr_BadArgument(); | |
2914 goto onError; | |
2915 } | |
2916 | |
2917 if (encoding == NULL) | |
2918 encoding = PyUnicode_GetDefaultEncoding(); | |
2919 | |
2920 /* Decode via the codec registry */ | |
2921 v = PyCodec_Decode(unicode, encoding, errors); | |
2922 if (v == NULL) | |
2923 goto onError; | |
2924 if (!PyUnicode_Check(v)) { | |
2925 PyErr_Format(PyExc_TypeError, | |
2926 "decoder did not return a str object (type=%.400s)", | |
2927 Py_TYPE(v)->tp_name); | |
2928 Py_DECREF(v); | |
2929 goto onError; | |
2930 } | |
2931 return unicode_result(v); | |
2932 | |
2933 onError: | |
2934 return NULL; | |
2935 } | |
2936 | |
2937 PyObject * | |
2938 PyUnicode_Encode(const Py_UNICODE *s, | |
2939 Py_ssize_t size, | |
2940 const char *encoding, | |
2941 const char *errors) | |
2942 { | |
2943 PyObject *v, *unicode; | |
2944 | |
2945 unicode = PyUnicode_FromUnicode(s, size); | |
2946 if (unicode == NULL) | |
2947 return NULL; | |
2948 v = PyUnicode_AsEncodedString(unicode, encoding, errors); | |
2949 Py_DECREF(unicode); | |
2950 return v; | |
2951 } | |
2952 | |
2953 PyObject * | |
2954 PyUnicode_AsEncodedObject(PyObject *unicode, | |
2955 const char *encoding, | |
2956 const char *errors) | |
2957 { | |
2958 PyObject *v; | |
2959 | |
2960 if (!PyUnicode_Check(unicode)) { | |
2961 PyErr_BadArgument(); | |
2962 goto onError; | |
2963 } | |
2964 | |
2965 if (encoding == NULL) | |
2966 encoding = PyUnicode_GetDefaultEncoding(); | |
2967 | |
2968 /* Encode via the codec registry */ | |
2969 v = PyCodec_Encode(unicode, encoding, errors); | |
2970 if (v == NULL) | |
2971 goto onError; | |
2972 return v; | |
2973 | |
2974 onError: | |
2975 return NULL; | |
2976 } | |
2977 | |
/* Encode a str object for use as a filesystem name.

   Which implementation is compiled depends on the platform:
   - HAVE_MBCS:  PyUnicode_EncodeCodePage() with CP_ACP.
   - __APPLE__:  UTF-8 with the "surrogateescape" error handler.
   - otherwise:  Py_FileSystemDefaultEncoding with "surrogateescape" once
                 the interpreter's filesystem codec is initialized, else a
                 C-level locale conversion through _Py_wchar2char().

   Returns the encoded object, or NULL on failure. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
#ifdef HAVE_MBCS
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
#elif defined(__APPLE__)
    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
#else
    PyInterpreterState *interp = PyThreadState_GET()->interp;
    /* Bootstrap check: if the filesystem codec is implemented in Python, we
       cannot use it to encode and decode filenames before it is loaded.
       Loading the Python codec requires encoding at least its own filename.
       Use the C version of the locale codec until the codec registry is
       initialized and the Python codec is loaded.

       Py_FileSystemDefaultEncoding is shared between all interpreters, so we
       cannot rely on it alone: also check interp->fscodec_initialized for
       subinterpreters. */
    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
        return PyUnicode_AsEncodedString(unicode,
                                         Py_FileSystemDefaultEncoding,
                                         "surrogateescape");
    }
    else {
        /* locale encoding with surrogateescape */
        wchar_t *wchar;
        char *bytes;
        PyObject *bytes_obj;
        size_t error_pos;

        wchar = PyUnicode_AsWideCharString(unicode, NULL);
        if (wchar == NULL)
            return NULL;
        bytes = _Py_wchar2char(wchar, &error_pos);
        if (bytes == NULL) {
            /* error_pos == (size_t)-1 is treated as an allocation failure;
               any other value is used as the index of the offending
               character in the encode exception raised below. */
            if (error_pos != (size_t)-1) {
                /* Read errno before any other call can overwrite it. */
                char *errmsg = strerror(errno);
                PyObject *exc = NULL;
                if (errmsg == NULL)
                    errmsg = "Py_wchar2char() failed";
                raise_encode_exception(&exc,
                    "filesystemencoding", unicode,
                    error_pos, error_pos+1,
                    errmsg);
                Py_XDECREF(exc);
            }
            else
                PyErr_NoMemory();
            PyMem_Free(wchar);
            return NULL;
        }
        /* The wide-character copy is no longer needed once converted. */
        PyMem_Free(wchar);

        bytes_obj = PyBytes_FromString(bytes);
        PyMem_Free(bytes);
        return bytes_obj;
    }
#endif
}
3037 | |
3038 PyObject * | |
3039 PyUnicode_AsEncodedString(PyObject *unicode, | |
3040 const char *encoding, | |
3041 const char *errors) | |
3042 { | |
3043 PyObject *v; | |
3044 char lower[11]; /* Enough for any encoding shortcut */ | |
3045 | |
3046 if (!PyUnicode_Check(unicode)) { | |
3047 PyErr_BadArgument(); | |
3048 return NULL; | |
3049 } | |
3050 | |
3051 /* Shortcuts for common default encodings */ | |
3052 if (normalize_encoding(encoding, lower, sizeof(lower))) { | |
3053 if ((strcmp(lower, "utf-8") == 0) || | |
3054 (strcmp(lower, "utf8") == 0)) | |
3055 { | |
3056 if (errors == NULL || strcmp(errors, "strict") == 0) | |
3057 return _PyUnicode_AsUTF8String(unicode, NULL); | |
3058 else | |
3059 return _PyUnicode_AsUTF8String(unicode, errors); | |
3060 } | |
3061 else if ((strcmp(lower, "latin-1") == 0) || | |
3062 (strcmp(lower, "latin1") == 0) || | |
3063 (strcmp(lower, "iso-8859-1") == 0)) | |
3064 return _PyUnicode_AsLatin1String(unicode, errors); | |
3065 #ifdef HAVE_MBCS | |
3066 else if (strcmp(lower, "mbcs") == 0) | |
3067 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); | |
3068 #endif | |
3069 else if (strcmp(lower, "ascii") == 0) | |
3070 return _PyUnicode_AsASCIIString(unicode, errors); | |
3071 } | |
3072 | |
3073 /* Encode via the codec registry */ | |
3074 v = PyCodec_Encode(unicode, encoding, errors); | |
3075 if (v == NULL) | |
3076 return NULL; | |
3077 | |
3078 /* The normal path */ | |
3079 if (PyBytes_Check(v)) | |
3080 return v; | |
3081 | |
3082 /* If the codec returns a buffer, raise a warning and convert to bytes */ | |
3083 if (PyByteArray_Check(v)) { | |
3084 int error; | |
3085 PyObject *b; | |
3086 | |
3087 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, | |
3088 "encoder %s returned bytearray instead of bytes", | |
3089 encoding); | |
3090 if (error) { | |
3091 Py_DECREF(v); | |
3092 return NULL; | |
3093 } | |
3094 | |
3095 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); | |
3096 Py_DECREF(v); | |
3097 return b; | |
3098 } | |
3099 | |
3100 PyErr_Format(PyExc_TypeError, | |
3101 "encoder did not return a bytes object (type=%.400s)", | |
3102 Py_TYPE(v)->tp_name); | |
3103 Py_DECREF(v); | |
3104 return NULL; | |
3105 } | |
3106 | |
3107 PyObject * | |
3108 PyUnicode_AsEncodedUnicode(PyObject *unicode, | |
3109 const char *encoding, | |
3110 const char *errors) | |
3111 { | |
3112 PyObject *v; | |
3113 | |
3114 if (!PyUnicode_Check(unicode)) { | |
3115 PyErr_BadArgument(); | |
3116 goto onError; | |
3117 } | |
3118 | |
3119 if (encoding == NULL) | |
3120 encoding = PyUnicode_GetDefaultEncoding(); | |
3121 | |
3122 /* Encode via the codec registry */ | |
3123 v = PyCodec_Encode(unicode, encoding, errors); | |
3124 if (v == NULL) | |
3125 goto onError; | |
3126 if (!PyUnicode_Check(v)) { | |
3127 PyErr_Format(PyExc_TypeError, | |
3128 "encoder did not return an str object (type=%.400s)", | |
3129 Py_TYPE(v)->tp_name); | |
3130 Py_DECREF(v); | |
3131 goto onError; | |
3132 } | |
3133 return v; | |
3134 | |
3135 onError: | |
3136 return NULL; | |
3137 } | |
3138 | |
3139 PyObject* | |
3140 PyUnicode_DecodeFSDefault(const char *s) { | |
3141 Py_ssize_t size = (Py_ssize_t)strlen(s); | |
3142 return PyUnicode_DecodeFSDefaultAndSize(s, size); | |
3143 } | |
3144 | |
/* Decode size bytes starting at s using the filesystem encoding.

   Which implementation is compiled depends on the platform:
   - HAVE_MBCS:  PyUnicode_DecodeMBCS().
   - __APPLE__:  UTF-8 with the "surrogateescape" error handler.
   - otherwise:  Py_FileSystemDefaultEncoding with "surrogateescape" once
                 the interpreter's filesystem codec is initialized, else a
                 C-level locale conversion through _Py_char2wchar().

   In the locale fallback, s must be NUL-terminated at s[size] and must not
   contain an earlier NUL; otherwise TypeError is raised.

   Returns the decoded object, or NULL on failure. */
PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
{
#ifdef HAVE_MBCS
    return PyUnicode_DecodeMBCS(s, size, NULL);
#elif defined(__APPLE__)
    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
#else
    PyInterpreterState *interp = PyThreadState_GET()->interp;
    /* Bootstrap check: if the filesystem codec is implemented in Python, we
       cannot use it to encode and decode filenames before it is loaded.
       Loading the Python codec requires encoding at least its own filename.
       Use the C version of the locale codec until the codec registry is
       initialized and the Python codec is loaded.

       Py_FileSystemDefaultEncoding is shared between all interpreters, so we
       cannot rely on it alone: also check interp->fscodec_initialized for
       subinterpreters. */
    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
        return PyUnicode_Decode(s, size,
                                Py_FileSystemDefaultEncoding,
                                "surrogateescape");
    }
    else {
        /* locale encoding with surrogateescape */
        wchar_t *wchar;
        PyObject *unicode;
        size_t len;

        /* _Py_char2wchar() takes no length argument, so the buffer has to
           be exactly one C string of length size. */
        if (s[size] != '\0' || size != strlen(s)) {
            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
            return NULL;
        }

        wchar = _Py_char2wchar(s, &len);
        if (wchar == NULL)
            return PyErr_NoMemory();

        unicode = PyUnicode_FromWideChar(wchar, len);
        PyMem_Free(wchar);
        return unicode;
    }
#endif
}
3189 | |
3190 | |
3191 int | |
3192 PyUnicode_FSConverter(PyObject* arg, void* addr) | |
3193 { | |
3194 PyObject *output = NULL; | |
3195 Py_ssize_t size; | |
3196 void *data; | |
3197 if (arg == NULL) { | |
3198 Py_DECREF(*(PyObject**)addr); | |
3199 return 1; | |
3200 } | |
3201 if (PyBytes_Check(arg)) { | |
3202 output = arg; | |
3203 Py_INCREF(output); | |
3204 } | |
3205 else { | |
3206 arg = PyUnicode_FromObject(arg); | |
3207 if (!arg) | |
3208 return 0; | |
3209 output = PyUnicode_EncodeFSDefault(arg); | |
3210 Py_DECREF(arg); | |
3211 if (!output) | |
3212 return 0; | |
3213 if (!PyBytes_Check(output)) { | |
3214 Py_DECREF(output); | |
3215 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); | |
3216 return 0; | |
3217 } | |
3218 } | |
3219 size = PyBytes_GET_SIZE(output); | |
3220 data = PyBytes_AS_STRING(output); | |
3221 if (size != strlen(data)) { | |
3222 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); | |
3223 Py_DECREF(output); | |
3224 return 0; | |
3225 } | |
3226 *(PyObject**)addr = output; | |
3227 return Py_CLEANUP_SUPPORTED; | |
3228 } | |
3229 | |
3230 | |
3231 int | |
3232 PyUnicode_FSDecoder(PyObject* arg, void* addr) | |
3233 { | |
3234 PyObject *output = NULL; | |
3235 if (arg == NULL) { | |
3236 Py_DECREF(*(PyObject**)addr); | |
3237 return 1; | |
3238 } | |
3239 if (PyUnicode_Check(arg)) { | |
3240 if (PyUnicode_READY(arg)) | |
3241 return 0; | |
3242 output = arg; | |
3243 Py_INCREF(output); | |
3244 } | |
3245 else { | |
3246 arg = PyBytes_FromObject(arg); | |
3247 if (!arg) | |
3248 return 0; | |
3249 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), | |
3250 PyBytes_GET_SIZE(arg)); | |
3251 Py_DECREF(arg); | |
3252 if (!output) | |
3253 return 0; | |
3254 if (!PyUnicode_Check(output)) { | |
3255 Py_DECREF(output); | |
3256 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); | |
3257 return 0; | |
3258 } | |
3259 } | |
3260 if (PyUnicode_READY(output) < 0) { | |
3261 Py_DECREF(output); | |
3262 return 0; | |
3263 } | |
3264 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), | |
3265 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { | |
3266 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); | |
3267 Py_DECREF(output); | |
3268 return 0; | |
3269 } | |
3270 *(PyObject**)addr = output; | |
3271 return Py_CLEANUP_SUPPORTED; | |
3272 } | |
3273 | |
3274 | |
3275 char* | |
3276 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) | |
3277 { | |
3278 PyObject *bytes; | |
3279 | |
3280 if (!PyUnicode_Check(unicode)) { | |
3281 PyErr_BadArgument(); | |
3282 return NULL; | |
3283 } | |
3284 if (PyUnicode_READY(unicode) == -1) | |
3285 return NULL; | |
3286 | |
3287 if (PyUnicode_UTF8(unicode) == NULL) { | |
3288 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); | |
3289 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); | |
3290 if (bytes == NULL) | |
3291 return NULL; | |
3292 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); | |
3293 if (_PyUnicode_UTF8(unicode) == NULL) { | |
3294 Py_DECREF(bytes); | |
3295 return NULL; | |
3296 } | |
3297 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); | |
3298 Py_MEMCPY(_PyUnicode_UTF8(unicode), | |
3299 PyBytes_AS_STRING(bytes), | |
3300 _PyUnicode_UTF8_LENGTH(unicode) + 1); | |
3301 Py_DECREF(bytes); | |
3302 } | |
3303 | |
3304 if (psize) | |
3305 *psize = PyUnicode_UTF8_LENGTH(unicode); | |
3306 return PyUnicode_UTF8(unicode); | |
3307 } | |
3308 | |
3309 char* | |
3310 PyUnicode_AsUTF8(PyObject *unicode) | |
3311 { | |
3312 return PyUnicode_AsUTF8AndSize(unicode, NULL); | |
3313 } | |
3314 | |
#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_AsUnicodeAndSize() each
   time it has to build the wchar_t representation of a string. */
static int unicode_as_unicode_calls = 0;
#endif
3318 | |
3319 | |
/* Return the wchar_t representation of a str object, building and caching
   it in the object's wstr slot if it does not exist yet.

   unicode -- the str object; anything else triggers PyErr_BadArgument().
   size    -- if not NULL, receives PyUnicode_WSTR_LENGTH(unicode).

   The buffer is allocated with PyObject_MALLOC() and stored in the object;
   the returned pointer is that stored buffer.  Returns NULL with
   MemoryError set if the allocation fails.

   The conversion depends on the string's storage kind and on
   SIZEOF_WCHAR_T:
   - 1-byte data is widened element by element;
   - 2-byte data is widened when wchar_t is 4 bytes wide;
   - 4-byte data is converted to surrogate pairs when wchar_t is 2 bytes
     wide.
   The remaining combinations are treated as impossible states and end in
   Py_FatalError(). */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            /* First pass: every code point above 0xFFFF needs one extra
               wchar_t because it is written as a surrogate pair. */
            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* + 1 leaves room for the terminating 0. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting large code points. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Sanity check on the length computed in the first pass. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* + 1 leaves room for the terminating 0. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only written for non-compact-ASCII
               objects here. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Undo the allocation made above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3435 | |
3436 Py_UNICODE * | |
3437 PyUnicode_AsUnicode(PyObject *unicode) | |
3438 { | |
3439 return PyUnicode_AsUnicodeAndSize(unicode, NULL); | |
3440 } | |
3441 | |
3442 | |
3443 Py_ssize_t | |
3444 PyUnicode_GetSize(PyObject *unicode) | |
3445 { | |
3446 if (!PyUnicode_Check(unicode)) { | |
3447 PyErr_BadArgument(); | |
3448 goto onError; | |
3449 } | |
3450 return PyUnicode_GET_SIZE(unicode); | |
3451 | |
3452 onError: | |
3453 return -1; | |
3454 } | |
3455 | |
3456 Py_ssize_t | |
3457 PyUnicode_GetLength(PyObject *unicode) | |
3458 { | |
3459 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { | |
3460 PyErr_BadArgument(); | |
3461 return -1; | |
3462 } | |
3463 | |
3464 return PyUnicode_GET_LENGTH(unicode); | |
3465 } | |
3466 | |
3467 Py_UCS4 | |
3468 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) | |
3469 { | |
3470 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { | |
3471 PyErr_BadArgument(); | |
3472 return (Py_UCS4)-1; | |
3473 } | |
3474 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { | |
3475 PyErr_SetString(PyExc_IndexError, "string index out of range"); | |
3476 return (Py_UCS4)-1; | |
3477 } | |
3478 return PyUnicode_READ_CHAR(unicode, index); | |
3479 } | |
3480 | |
3481 int | |
3482 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) | |
3483 { | |
3484 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { | |
3485 PyErr_BadArgument(); | |
3486 return -1; | |
3487 } | |
3488 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { | |
3489 PyErr_SetString(PyExc_IndexError, "string index out of range"); | |
3490 return -1; | |
3491 } | |
3492 if (_PyUnicode_Dirty(unicode)) | |
3493 return -1; | |
3494 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), | |
3495 index, ch); | |
3496 return 0; | |
3497 } | |
3498 | |
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3504 | |
3505 /* create or adjust a UnicodeDecodeError */ | |
3506 static void | |
3507 make_decode_exception(PyObject **exceptionObject, | |
3508 const char *encoding, | |
3509 const char *input, Py_ssize_t length, | |
3510 Py_ssize_t startpos, Py_ssize_t endpos, | |
3511 const char *reason) | |
3512 { | |
3513 if (*exceptionObject == NULL) { | |
3514 *exceptionObject = PyUnicodeDecodeError_Create( | |
3515 encoding, input, length, startpos, endpos, reason); | |
3516 } | |
3517 else { | |
3518 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) | |
3519 goto onError; | |
3520 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) | |
3521 goto onError; | |
3522 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) | |
3523 goto onError; | |
3524 } | |
3525 return; | |
3526 | |
3527 onError: | |
3528 Py_DECREF(*exceptionObject); | |
3529 *exceptionObject = NULL; | |
3530 } | |
3531 | |
3532 /* error handling callback helper: | |
3533 build arguments, call the callback and check the arguments, | |
3534 if no exception occurred, copy the replacement to the output | |
3535 and adjust various state variables. | |
3536 return 0 on success, -1 on error | |
3537 */ | |
3538 | |
3539 static int | |
3540 unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, | |
3541 const char *encoding, const char *reason, | |
3542 const char **input, const char **inend, Py_ssize_t *startinpos, | |
3543 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, | |
3544 PyObject **output, Py_ssize_t *outpos) | |
3545 { | |
3546 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; | |
3547 | |
3548 PyObject *restuple = NULL; | |
3549 PyObject *repunicode = NULL; | |
3550 Py_ssize_t outsize; | |
3551 Py_ssize_t insize; | |
3552 Py_ssize_t requiredsize; | |
3553 Py_ssize_t newpos; | |
3554 PyObject *inputobj = NULL; | |
3555 int res = -1; | |
3556 | |
3557 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) | |
3558 outsize = PyUnicode_GET_LENGTH(*output); | |
3559 else | |
3560 outsize = _PyUnicode_WSTR_LENGTH(*output); | |
3561 | |
3562 if (*errorHandler == NULL) { | |
3563 *errorHandler = PyCodec_LookupError(errors); | |
3564 if (*errorHandler == NULL) | |
3565 goto onError; | |
3566 } | |
3567 | |
3568 make_decode_exception(exceptionObject, | |
3569 encoding, | |
3570 *input, *inend - *input, | |
3571 *startinpos, *endinpos, | |
3572 reason); | |
3573 if (*exceptionObject == NULL) | |
3574 goto onError; | |
3575 | |
3576 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); | |
3577 if (restuple == NULL) | |
3578 goto onError; | |
3579 if (!PyTuple_Check(restuple)) { | |
3580 PyErr_SetString(PyExc_TypeError, &argparse[4]); | |
3581 goto onError; | |
3582 } | |
3583 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) | |
3584 goto onError; | |
3585 if (PyUnicode_READY(repunicode) < 0) | |
3586 goto onError; | |
3587 | |
3588 /* Copy back the bytes variables, which might have been modified by the | |
3589 callback */ | |
3590 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); | |
3591 if (!inputobj) | |
3592 goto onError; | |
3593 if (!PyBytes_Check(inputobj)) { | |
3594 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); | |
3595 } | |
3596 *input = PyBytes_AS_STRING(inputobj); | |
3597 insize = PyBytes_GET_SIZE(inputobj); | |
3598 *inend = *input + insize; | |
3599 /* we can DECREF safely, as the exception has another reference, | |
3600 so the object won't go away. */ | |
3601 Py_DECREF(inputobj); | |
3602 | |
3603 if (newpos<0) | |
3604 newpos = insize+newpos; | |
3605 if (newpos<0 || newpos>insize) { | |
3606 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); | |
3607 goto onError; | |
3608 } | |
3609 | |
3610 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) { | |
3611 /* need more space? (at least enough for what we | |
3612 have+the replacement+the rest of the string (starting | |
3613 at the new input position), so we won't have to check space | |
3614 when there are no errors in the rest of the string) */ | |
3615 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode); | |
3616 requiredsize = *outpos + replen + insize-newpos; | |
3617 if (requiredsize > outsize) { | |
3618 if (requiredsize<2*outsize) | |
3619 requiredsize = 2*outsize; | |
3620 if (unicode_resize(output, requiredsize) < 0) | |
3621 goto onError; | |
3622 } | |
3623 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0) | |
3624 goto onError; | |
3625 copy_characters(*output, *outpos, repunicode, 0, replen); | |
3626 *outpos += replen; | |
3627 } | |
3628 else { | |
3629 wchar_t *repwstr; | |
3630 Py_ssize_t repwlen; | |
3631 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); | |
3632 if (repwstr == NULL) | |
3633 goto onError; | |
3634 /* need more space? (at least enough for what we | |
3635 have+the replacement+the rest of the string (starting | |
3636 at the new input position), so we won't have to check space | |
3637 when there are no errors in the rest of the string) */ | |
3638 requiredsize = *outpos + repwlen + insize-newpos; | |
3639 if (requiredsize > outsize) { | |
3640 if (requiredsize < 2*outsize) | |
3641 requiredsize = 2*outsize; | |
3642 if (unicode_resize(output, requiredsize) < 0) | |
3643 goto onError; | |
3644 } | |
3645 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); | |
3646 *outpos += repwlen; | |
3647 } | |
3648 *endinpos = newpos; | |
3649 *inptr = *input + newpos; | |
3650 | |
3651 /* we made it! */ | |
3652 res = 0; | |
3653 | |
3654 onError: | |
3655 Py_XDECREF(restuple); | |
3656 return res; | |
3657 } | |
3658 | |
3659 /* --- UTF-7 Codec -------------------------------------------------------- */ | |
3660 | |
3661 /* See RFC2152 for details. We encode conservatively and decode liberally. */ | |
3662 | |
/* Three simple macros defining base-64.
   IS_BASE64, FROM_BASE64 and DECODE_DIRECT expand their argument more than
   once, so it must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (Any character that is not a letter, digit or '+' maps to 63, so the
   result is only meaningful after IS_BASE64(c) has been checked.) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3693 | |
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code (0..127),
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3727 | |
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is checked to be in 1..127 before it is used as an index into
 * utf7_category.  c is expanded several times, and directO / directWS are
 * expanded without parentheses, so pass simple side-effect-free
 * expressions. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
3739 | |
3740 PyObject * | |
3741 PyUnicode_DecodeUTF7(const char *s, | |
3742 Py_ssize_t size, | |
3743 const char *errors) | |
3744 { | |
3745 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); | |
3746 } | |
3747 | |
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  -- the input bytes.
 * errors   -- error handler name passed to
 *             unicode_decode_call_errorhandler().
 * consumed -- if not NULL, receives the number of input bytes consumed;
 *             an unfinished shift sequence at the end of the input is then
 *             left unconsumed instead of being reported as an error.
 *
 * Returns the decoded string, or NULL on failure. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyObject *unicode;
    const char *errmsg = "";
    int inShift = 0;                    /* inside a '+...' base-64 section? */
    Py_ssize_t shiftOutStart;           /* output position when it began */
    unsigned int base64bits = 0;        /* number of bits in base64buffer */
    unsigned long base64buffer = 0;     /* bits read but not yet emitted */
    Py_UCS4 surrogate = 0;              /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return unicode;
    }

    shiftOutStart = outpos = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Unpaired high surrogate: emit it as-is and
                               go on to handle outCh below. */
                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* A high surrogate with nothing after it is emitted
                       unchanged. */
                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence starts, for error reporting and
               for the "consumed" back-off at the end. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (unicode_putchar(&unicode, &outpos, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = outpos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Reached only through goto: hand the faulty range to the error
           handler, which may move s and replace the input buffer. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos))
                goto onError;
            /* The handler may have moved s back into the input; if so,
               re-enter the loop body. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            outpos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* Trim the output to the number of characters actually written. */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
3938 | |
3939 | |
3940 PyObject * | |
3941 _PyUnicode_EncodeUTF7(PyObject *str, | |
3942 int base64SetO, | |
3943 int base64WhiteSpace, | |
3944 const char *errors) | |
3945 { | |
3946 int kind; | |
3947 void *data; | |
3948 Py_ssize_t len; | |
3949 PyObject *v; | |
3950 Py_ssize_t allocated; | |
3951 int inShift = 0; | |
3952 Py_ssize_t i; | |
3953 unsigned int base64bits = 0; | |
3954 unsigned long base64buffer = 0; | |
3955 char * out; | |
3956 char * start; | |
3957 | |
3958 if (PyUnicode_READY(str) < 0) | |
3959 return NULL; | |
3960 kind = PyUnicode_KIND(str); | |
3961 data = PyUnicode_DATA(str); | |
3962 len = PyUnicode_GET_LENGTH(str); | |
3963 | |
3964 if (len == 0) | |
3965 return PyBytes_FromStringAndSize(NULL, 0); | |
3966 | |
3967 /* It might be possible to tighten this worst case */ | |
3968 allocated = 8 * len; | |
3969 if (allocated / 8 != len) | |
3970 return PyErr_NoMemory(); | |
3971 | |
3972 v = PyBytes_FromStringAndSize(NULL, allocated); | |
3973 if (v == NULL) | |
3974 return NULL; | |
3975 | |
3976 start = out = PyBytes_AS_STRING(v); | |
3977 for (i = 0; i < len; ++i) { | |
3978 Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
3979 | |
3980 if (inShift) { | |
3981 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { | |
3982 /* shifting out */ | |
3983 if (base64bits) { /* output remaining bits */ | |
3984 *out++ = TO_BASE64(base64buffer << (6-base64bits)); | |
3985 base64buffer = 0; | |
3986 base64bits = 0; | |
3987 } | |
3988 inShift = 0; | |
3989 /* Characters not in the BASE64 set implicitly unshift the sequence | |
3990 so no '-' is required, except if the character is itself a '-' */ | |
3991 if (IS_BASE64(ch) || ch == '-') { | |
3992 *out++ = '-'; | |
3993 } | |
3994 *out++ = (char) ch; | |
3995 } | |
3996 else { | |
3997 goto encode_char; | |
3998 } | |
3999 } | |
4000 else { /* not in a shift sequence */ | |
4001 if (ch == '+') { | |
4002 *out++ = '+'; | |
4003 *out++ = '-'; | |
4004 } | |
4005 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { | |
4006 *out++ = (char) ch; | |
4007 } | |
4008 else { | |
4009 *out++ = '+'; | |
4010 inShift = 1; | |
4011 goto encode_char; | |
4012 } | |
4013 } | |
4014 continue; | |
4015 encode_char: | |
4016 if (ch >= 0x10000) { | |
4017 assert(ch <= MAX_UNICODE); | |
4018 | |
4019 /* code first surrogate */ | |
4020 base64bits += 16; | |
4021 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); | |
4022 while (base64bits >= 6) { | |
4023 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); | |
4024 base64bits -= 6; | |
4025 } | |
4026 /* prepare second surrogate */ | |
4027 ch = Py_UNICODE_LOW_SURROGATE(ch); | |
4028 } | |
4029 base64bits += 16; | |
4030 base64buffer = (base64buffer << 16) | ch; | |
4031 while (base64bits >= 6) { | |
4032 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); | |
4033 base64bits -= 6; | |
4034 } | |
4035 } | |
4036 if (base64bits) | |
4037 *out++= TO_BASE64(base64buffer << (6-base64bits) ); | |
4038 if (inShift) | |
4039 *out++ = '-'; | |
4040 if (_PyBytes_Resize(&v, out - start) < 0) | |
4041 return NULL; | |
4042 return v; | |
4043 } | |
4044 PyObject * | |
4045 PyUnicode_EncodeUTF7(const Py_UNICODE *s, | |
4046 Py_ssize_t size, | |
4047 int base64SetO, | |
4048 int base64WhiteSpace, | |
4049 const char *errors) | |
4050 { | |
4051 PyObject *result; | |
4052 PyObject *tmp = PyUnicode_FromUnicode(s, size); | |
4053 if (tmp == NULL) | |
4054 return NULL; | |
4055 result = _PyUnicode_EncodeUTF7(tmp, base64SetO, | |
4056 base64WhiteSpace, errors); | |
4057 Py_DECREF(tmp); | |
4058 return result; | |
4059 } | |
4060 | |
4061 #undef IS_BASE64 | |
4062 #undef FROM_BASE64 | |
4063 #undef TO_BASE64 | |
4064 #undef DECODE_DIRECT | |
4065 #undef ENCODE_DIRECT | |
4066 | |
4067 /* --- UTF-8 Codec -------------------------------------------------------- */ | |
4068 | |
/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 3629 for details.
   The table is never written to, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4090 | |
4091 PyObject * | |
4092 PyUnicode_DecodeUTF8(const char *s, | |
4093 Py_ssize_t size, | |
4094 const char *errors) | |
4095 { | |
4096 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); | |
4097 } | |
4098 | |
4099 #include "stringlib/ucs1lib.h" | |
4100 #include "stringlib/codecs.h" | |
4101 #include "stringlib/undef.h" | |
4102 | |
4103 #include "stringlib/ucs2lib.h" | |
4104 #include "stringlib/codecs.h" | |
4105 #include "stringlib/undef.h" | |
4106 | |
4107 #include "stringlib/ucs4lib.h" | |
4108 #include "stringlib/codecs.h" | |
4109 #include "stringlib/undef.h" | |
4110 | |
4111 /* Mask to check or force alignment of a pointer to C 'long' boundaries */ | |
4112 #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) | |
4113 | |
4114 /* Mask to quickly check whether a C 'long' contains a | |
4115 non-ASCII, UTF8-encoded char. */ | |
4116 #if (SIZEOF_LONG == 8) | |
4117 # define ASCII_CHAR_MASK 0x8080808080808080L | |
4118 #elif (SIZEOF_LONG == 4) | |
4119 # define ASCII_CHAR_MASK 0x80808080L | |
4120 #else | |
4121 # error C 'long' size should be either 4 or 8! | |
4122 #endif | |
4123 | |
4124 /* Scans a UTF-8 string and returns the maximum character to be expected | |
4125 and the size of the decoded unicode string. | |
4126 | |
4127 This function doesn't check for errors, these checks are performed in | |
4128 PyUnicode_DecodeUTF8Stateful. | |
4129 */ | |
4130 static Py_UCS4 | |
4131 utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size, | |
4132 Py_ssize_t *unicode_size) | |
4133 { | |
4134 Py_ssize_t char_count = 0; | |
4135 const unsigned char *p = (const unsigned char *)s; | |
4136 const unsigned char *end = p + string_size; | |
4137 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); | |
4138 | |
4139 assert(unicode_size != NULL); | |
4140 | |
4141 /* By having a cascade of independent loops which fallback onto each | |
4142 other, we minimize the amount of work done in the average loop | |
4143 iteration, and we also maximize the CPU's ability to predict | |
4144 branches correctly (because a given condition will have always the | |
4145 same boolean outcome except perhaps in the last iteration of the | |
4146 corresponding loop). | |
4147 In the general case this brings us rather close to decoding | |
4148 performance pre-PEP 393, despite the two-pass decoding. | |
4149 | |
4150 Note that the pure ASCII loop is not duplicated once a non-ASCII | |
4151 character has been encountered. It is actually a pessimization (by | |
4152 a significant factor) to use this loop on text with many non-ASCII | |
4153 characters, and it is important to avoid bad performance on valid | |
4154 utf-8 data (invalid utf-8 being a different can of worms). | |
4155 */ | |
4156 | |
4157 /* ASCII */ | |
4158 for (; p < end; ++p) { | |
4159 /* Only check value if it's not a ASCII char... */ | |
4160 if (*p < 0x80) { | |
4161 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for | |
4162 an explanation. */ | |
4163 if (!((size_t) p & LONG_PTR_MASK)) { | |
4164 /* Help register allocation */ | |
4165 register const unsigned char *_p = p; | |
4166 while (_p < aligned_end) { | |
4167 unsigned long value = *(unsigned long *) _p; | |
4168 if (value & ASCII_CHAR_MASK) | |
4169 break; | |
4170 _p += SIZEOF_LONG; | |
4171 char_count += SIZEOF_LONG; | |
4172 } | |
4173 p = _p; | |
4174 if (p == end) | |
4175 break; | |
4176 } | |
4177 } | |
4178 if (*p < 0x80) | |
4179 ++char_count; | |
4180 else | |
4181 goto _ucs1loop; | |
4182 } | |
4183 *unicode_size = char_count; | |
4184 return 127; | |
4185 | |
4186 _ucs1loop: | |
4187 for (; p < end; ++p) { | |
4188 if (*p < 0xc4) | |
4189 char_count += ((*p & 0xc0) != 0x80); | |
4190 else | |
4191 goto _ucs2loop; | |
4192 } | |
4193 *unicode_size = char_count; | |
4194 return 255; | |
4195 | |
4196 _ucs2loop: | |
4197 for (; p < end; ++p) { | |
4198 if (*p < 0xf0) | |
4199 char_count += ((*p & 0xc0) != 0x80); | |
4200 else | |
4201 goto _ucs4loop; | |
4202 } | |
4203 *unicode_size = char_count; | |
4204 return 65535; | |
4205 | |
4206 _ucs4loop: | |
4207 for (; p < end; ++p) { | |
4208 char_count += ((*p & 0xc0) != 0x80); | |
4209 } | |
4210 *unicode_size = char_count; | |
4211 return 65537; | |
4212 } | |
4213 | |
4214 /* Called when we encountered some error that wasn't detected in the original | |
4215 scan, e.g. an encoded surrogate character. The original maxchar computation | |
4216 may have been incorrect, so redo it. */ | |
4217 static int | |
4218 refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n) | |
4219 { | |
4220 PyObject *tmp; | |
4221 Py_ssize_t k; | |
4222 Py_UCS4 maxchar; | |
4223 for (k = 0, maxchar = 0; k < n; k++) | |
4224 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k)); | |
4225 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar); | |
4226 if (tmp == NULL) | |
4227 return -1; | |
4228 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n); | |
4229 Py_DECREF(*unicode); | |
4230 *unicode = tmp; | |
4231 return 0; | |
4232 } | |
4233 | |
/* Store `value` at position `index` of the string being decoded.

   Without errors so far this is a plain PyUnicode_WRITE into (kind, data).
   Once has_errors is set the write goes through unicode_putchar(); the
   string is first grown to index + index/8 when index lies beyond its
   current length, so the result has to be shrunk at the end.

   Implicit parameters: unicode, kind, data, has_errors, and the onError
   label, which is the target of every failure. */
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (!has_errors) {                                              \
            PyUnicode_WRITE(kind, data, index, value);                  \
        }                                                               \
        else {                                                          \
            Py_ssize_t write_pos = index;                               \
            if (write_pos > PyUnicode_GET_LENGTH(unicode)) {            \
                if (unicode_resize(&unicode,                            \
                                   write_pos + write_pos/8) < 0)        \
                    goto onError;                                       \
            }                                                           \
            if (unicode_putchar(&unicode, &write_pos, value) < 0)       \
                goto onError;                                           \
        }                                                               \
    } while (0)
4252 | |
/* Decode `size` bytes of UTF-8 starting at `s` into a new Unicode object.

   s, size   input buffer and its length in bytes.
   errors    error-handling scheme, forwarded unchanged to
             unicode_decode_call_errorhandler().
   consumed  if non-NULL, an incomplete multi-byte sequence at the end of
             the input is not an error: decoding stops in front of it and
             *consumed receives the number of input bytes processed.
             If NULL, such a sequence is reported to the error handler as
             "unexpected end of data".

   Returns the decoded string, or NULL when allocation, resizing or the
   error handler fails (all temporaries are released on that path). */
4253 PyObject * | |
4254 PyUnicode_DecodeUTF8Stateful(const char *s, | |
4255 Py_ssize_t size, | |
4256 const char *errors, | |
4257 Py_ssize_t *consumed) | |
4258 { | |
4259 const char *starts = s; | |
4260 int n; | |
4261 int k; | |
4262 Py_ssize_t startinpos; | |
4263 Py_ssize_t endinpos; | |
4264 const char *e, *aligned_end; | |
4265 PyObject *unicode; | |
4266 const char *errmsg = ""; | |
4267 PyObject *errorHandler = NULL; | |
4268 PyObject *exc = NULL; | |
4269 Py_UCS4 maxchar = 0; | |
4270 Py_ssize_t unicode_size; | |
4271 Py_ssize_t i; | |
4272 int kind; | |
4273 void *data; | |
4274 int has_errors = 0; | |
4275 | |
4276 if (size == 0) { | |
4277 if (consumed) | |
4278 *consumed = 0; | |
4279 return (PyObject *)PyUnicode_New(0, 0); | |
4280 } | |
/* First pass: estimate the widest character and the number of characters
   so the result can be allocated before decoding starts. */
4281 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size); | |
4282 /* When the string is ASCII only, just use memcpy and return. | |
4283 unicode_size may be != size if there is an incomplete UTF-8 | |
4284 sequence at the end of the ASCII block. */ | |
4285 if (maxchar < 128 && size == unicode_size) { | |
4286 if (consumed) | |
4287 *consumed = size; | |
4288 | |
4289 if (size == 1) | |
4290 return get_latin1_char((unsigned char)s[0]); | |
4291 | |
4292 unicode = PyUnicode_New(unicode_size, maxchar); | |
4293 if (!unicode) | |
4294 return NULL; | |
4295 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); | |
4296 assert(_PyUnicode_CheckConsistency(unicode, 1)); | |
4297 return unicode; | |
4298 } | |
4299 | |
4300 /* In case of errors, maxchar and size computation might be incorrect; | |
4301 code below refits and resizes as necessary. */ | |
4302 unicode = PyUnicode_New(unicode_size, maxchar); | |
4303 if (!unicode) | |
4304 return NULL; | |
4305 kind = PyUnicode_KIND(unicode); | |
4306 data = PyUnicode_DATA(unicode); | |
4307 | |
4308 /* Unpack UTF-8 encoded data */ | |
4309 i = 0; | |
4310 e = s + size; | |
/* Second pass, attempt 1: the width-specific decoder from stringlib.  It
   advances s and i through its last two arguments; its return value is
   stored as has_errors. */
4311 switch (kind) { | |
4312 case PyUnicode_1BYTE_KIND: | |
4313 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i); | |
4314 break; | |
4315 case PyUnicode_2BYTE_KIND: | |
4316 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i); | |
4317 break; | |
4318 case PyUnicode_4BYTE_KIND: | |
4319 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i); | |
4320 break; | |
4321 } | |
4322 if (!has_errors) { | |
4323 /* Ensure the unicode size calculation was correct */ | |
4324 assert(i == unicode_size); | |
4325 assert(s == e); | |
4326 if (consumed) | |
4327 *consumed = s-starts; | |
4328 return unicode; | |
4329 } | |
4330 /* Fall through to the generic decoding loop for the rest of | |
4331 the string */ | |
/* has_errors is non-zero from here on (the zero case returned above). */
4332 if (refit_partial_string(&unicode, kind, data, i) < 0) | |
4333 goto onError; | |
4334 | |
4335 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); | |
4336 | |
4337 while (s < e) { | |
4338 Py_UCS4 ch = (unsigned char)*s; | |
4339 | |
4340 if (ch < 0x80) { | |
4341 /* Fast path for runs of ASCII characters. Given that common UTF-8 | |
4342 input will consist of an overwhelming majority of ASCII | |
4343 characters, we try to optimize for this case by checking | |
4344 as many characters as a C 'long' can contain. | |
4345 First, check if we can do an aligned read, as most CPUs have | |
4346 a penalty for unaligned reads. | |
4347 */ | |
4348 if (!((size_t) s & LONG_PTR_MASK)) { | |
4349 /* Help register allocation */ | |
4350 register const char *_s = s; | |
4351 register Py_ssize_t _i = i; | |
4352 while (_s < aligned_end) { | |
4353 /* Read a whole long at a time (either 4 or 8 bytes), | |
4354 and do a fast unrolled copy if it only contains ASCII | |
4355 characters. */ | |
4356 unsigned long value = *(unsigned long *) _s; | |
4357 if (value & ASCII_CHAR_MASK) | |
4358 break; | |
4359 WRITE_MAYBE_FAIL(_i+0, _s[0]); | |
4360 WRITE_MAYBE_FAIL(_i+1, _s[1]); | |
4361 WRITE_MAYBE_FAIL(_i+2, _s[2]); | |
4362 WRITE_MAYBE_FAIL(_i+3, _s[3]); | |
4363 #if (SIZEOF_LONG == 8) | |
4364 WRITE_MAYBE_FAIL(_i+4, _s[4]); | |
4365 WRITE_MAYBE_FAIL(_i+5, _s[5]); | |
4366 WRITE_MAYBE_FAIL(_i+6, _s[6]); | |
4367 WRITE_MAYBE_FAIL(_i+7, _s[7]); | |
4368 #endif | |
4369 _s += SIZEOF_LONG; | |
4370 _i += SIZEOF_LONG; | |
4371 } | |
4372 s = _s; | |
4373 i = _i; | |
4374 if (s == e) | |
4375 break; | |
4376 ch = (unsigned char)*s; | |
4377 } | |
4378 } | |
4379 | |
4380 if (ch < 0x80) { | |
4381 WRITE_MAYBE_FAIL(i++, ch); | |
4382 s++; | |
4383 continue; | |
4384 } | |
4385 | |
/* n is the sequence length announced by the lead byte (0 = illegal). */
4386 n = utf8_code_length[ch]; | |
4387 | |
/* The announced sequence would run past the end of the input: stop here
   when the caller asked for `consumed`, otherwise report the lead byte
   plus any continuation bytes that follow it as an error. */
4388 if (s + n > e) { | |
4389 if (consumed) | |
4390 break; | |
4391 else { | |
4392 errmsg = "unexpected end of data"; | |
4393 startinpos = s-starts; | |
4394 endinpos = startinpos+1; | |
4395 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) | |
4396 endinpos++; | |
4397 goto utf8Error; | |
4398 } | |
4399 } | |
4400 | |
4401 switch (n) { | |
4402 | |
4403 case 0: | |
4404 errmsg = "invalid start byte"; | |
4405 startinpos = s-starts; | |
4406 endinpos = startinpos+1; | |
4407 goto utf8Error; | |
4408 | |
4409 case 1: | |
4410 errmsg = "internal error"; | |
4411 startinpos = s-starts; | |
4412 endinpos = startinpos+1; | |
4413 goto utf8Error; | |
4414 | |
4415 case 2: | |
4416 if ((s[1] & 0xc0) != 0x80) { | |
4417 errmsg = "invalid continuation byte"; | |
4418 startinpos = s-starts; | |
4419 endinpos = startinpos + 1; | |
4420 goto utf8Error; | |
4421 } | |
4422 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); | |
4423 assert ((ch > 0x007F) && (ch <= 0x07FF)); | |
4424 WRITE_MAYBE_FAIL(i++, ch); | |
4425 break; | |
4426 | |
4427 case 3: | |
4428 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf | |
4429 will result in surrogates in range d800-dfff. Surrogates are | |
4430 not valid UTF-8 so they are rejected. | |
4431 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | |
4432 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | |
4433 if ((s[1] & 0xc0) != 0x80 || | |
4434 (s[2] & 0xc0) != 0x80 || | |
4435 ((unsigned char)s[0] == 0xE0 && | |
4436 (unsigned char)s[1] < 0xA0) || | |
4437 ((unsigned char)s[0] == 0xED && | |
4438 (unsigned char)s[1] > 0x9F)) { | |
4439 errmsg = "invalid continuation byte"; | |
4440 startinpos = s-starts; | |
4441 endinpos = startinpos + 1; | |
4442 | |
4443 /* if s[1] first two bits are 1 and 0, then the invalid | |
4444 continuation byte is s[2], so increment endinpos by 1, | |
4445 if not, s[1] is invalid and endinpos doesn't need to | |
4446 be incremented. */ | |
4447 if ((s[1] & 0xC0) == 0x80) | |
4448 endinpos++; | |
4449 goto utf8Error; | |
4450 } | |
4451 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); | |
4452 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | |
4453 WRITE_MAYBE_FAIL(i++, ch); | |
4454 break; | |
4455 | |
4456 case 4: | |
4457 if ((s[1] & 0xc0) != 0x80 || | |
4458 (s[2] & 0xc0) != 0x80 || | |
4459 (s[3] & 0xc0) != 0x80 || | |
4460 ((unsigned char)s[0] == 0xF0 && | |
4461 (unsigned char)s[1] < 0x90) || | |
4462 ((unsigned char)s[0] == 0xF4 && | |
4463 (unsigned char)s[1] > 0x8F)) { | |
4464 errmsg = "invalid continuation byte"; | |
4465 startinpos = s-starts; | |
4466 endinpos = startinpos + 1; | |
4467 if ((s[1] & 0xC0) == 0x80) { | |
4468 endinpos++; | |
4469 if ((s[2] & 0xC0) == 0x80) | |
4470 endinpos++; | |
4471 } | |
4472 goto utf8Error; | |
4473 } | |
4474 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + | |
4475 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); | |
4476 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE)); | |
4477 | |
4478 WRITE_MAYBE_FAIL(i++, ch); | |
4479 break; | |
4480 } | |
4481 s += n; | |
4482 continue; | |
4483 | |
/* Common error exit of the loop body: errmsg, startinpos and endinpos were
   set by the branch that jumped here.  The handler receives the addresses
   of starts, e, s, unicode and i, so all of them may change. */
4484 utf8Error: | |
4485 if (!has_errors) { | |
4486 if (refit_partial_string(&unicode, kind, data, i) < 0) | |
4487 goto onError; | |
4488 has_errors = 1; | |
4489 } | |
4490 if (unicode_decode_call_errorhandler( | |
4491 errors, &errorHandler, | |
4492 "utf8", errmsg, | |
4493 &starts, &e, &startinpos, &endinpos, &exc, &s, | |
4494 &unicode, &i)) | |
4495 goto onError; | |
4496 /* Update data because unicode_decode_call_errorhandler might have | |
4497 re-created or resized the unicode object. */ | |
4498 data = PyUnicode_DATA(unicode); | |
4499 kind = PyUnicode_KIND(unicode); | |
4500 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); | |
4501 } | |
4502 /* Ensure the unicode_size calculation above was correct: */ | |
4503 assert(has_errors || i == unicode_size); | |
4504 | |
4505 if (consumed) | |
4506 *consumed = s-starts; | |
4507 | |
4508 /* Adjust length and ready string when it contained errors and | |
4509 is of the old resizable kind. */ | |
4510 if (has_errors) { | |
4511 if (PyUnicode_Resize(&unicode, i) < 0) | |
4512 goto onError; | |
4513 } | |
4514 | |
4515 Py_XDECREF(errorHandler); | |
4516 Py_XDECREF(exc); | |
4517 assert(_PyUnicode_CheckConsistency(unicode, 1)); | |
4518 return unicode; | |
4519 | |
/* Failure exit: release the handler, the exception and the partial result. */
4520 onError: | |
4521 Py_XDECREF(errorHandler); | |
4522 Py_XDECREF(exc); | |
4523 Py_DECREF(unicode); | |
4524 return NULL; | |
4525 } | |
4526 | |
4527 #undef WRITE_MAYBE_FAIL | |
4528 | |
4529 #ifdef __APPLE__ | |
4530 | |
4531 /* Simplified UTF-8 decoder using surrogateescape error handler, | |
4532 used to decode the command line arguments on Mac OS X. */ | |
4533 | |
4534 wchar_t* | |
4535 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) | |
4536 { | |
4537 int n; | |
4538 const char *e; | |
4539 wchar_t *unicode, *p; | |
4540 | |
4541 /* Note: size will always be longer than the resulting Unicode | |
4542 character count */ | |
4543 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { | |
4544 PyErr_NoMemory(); | |
4545 return NULL; | |
4546 } | |
4547 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); | |
4548 if (!unicode) | |
4549 return NULL; | |
4550 | |
4551 /* Unpack UTF-8 encoded data */ | |
4552 p = unicode; | |
4553 e = s + size; | |
4554 while (s < e) { | |
4555 Py_UCS4 ch = (unsigned char)*s; | |
4556 | |
4557 if (ch < 0x80) { | |
4558 *p++ = (wchar_t)ch; | |
4559 s++; | |
4560 continue; | |
4561 } | |
4562 | |
4563 n = utf8_code_length[ch]; | |
4564 if (s + n > e) { | |
4565 goto surrogateescape; | |
4566 } | |
4567 | |
4568 switch (n) { | |
4569 case 0: | |
4570 case 1: | |
4571 goto surrogateescape; | |
4572 | |
4573 case 2: | |
4574 if ((s[1] & 0xc0) != 0x80) | |
4575 goto surrogateescape; | |
4576 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); | |
4577 assert ((ch > 0x007F) && (ch <= 0x07FF)); | |
4578 *p++ = (wchar_t)ch; | |
4579 break; | |
4580 | |
4581 case 3: | |
4582 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf | |
4583 will result in surrogates in range d800-dfff. Surrogates are | |
4584 not valid UTF-8 so they are rejected. | |
4585 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | |
4586 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | |
4587 if ((s[1] & 0xc0) != 0x80 || | |
4588 (s[2] & 0xc0) != 0x80 || | |
4589 ((unsigned char)s[0] == 0xE0 && | |
4590 (unsigned char)s[1] < 0xA0) || | |
4591 ((unsigned char)s[0] == 0xED && | |
4592 (unsigned char)s[1] > 0x9F)) { | |
4593 | |
4594 goto surrogateescape; | |
4595 } | |
4596 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); | |
4597 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | |
4598 *p++ = (wchar_t)ch; | |
4599 break; | |
4600 | |
4601 case 4: | |
4602 if ((s[1] & 0xc0) != 0x80 || | |
4603 (s[2] & 0xc0) != 0x80 || | |
4604 (s[3] & 0xc0) != 0x80 || | |
4605 ((unsigned char)s[0] == 0xF0 && | |
4606 (unsigned char)s[1] < 0x90) || | |
4607 ((unsigned char)s[0] == 0xF4 && | |
4608 (unsigned char)s[1] > 0x8F)) { | |
4609 goto surrogateescape; | |
4610 } | |
4611 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + | |
4612 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); | |
4613 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE)); | |
4614 | |
4615 #if SIZEOF_WCHAR_T == 4 | |
4616 *p++ = (wchar_t)ch; | |
4617 #else | |
4618 /* compute and append the two surrogates: */ | |
4619 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); | |
4620 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); | |
4621 #endif | |
4622 break; | |
4623 } | |
4624 s += n; | |
4625 continue; | |
4626 | |
4627 surrogateescape: | |
4628 *p++ = 0xDC00 + ch; | |
4629 s++; | |
4630 } | |
4631 *p = L'\0'; | |
4632 return unicode; | |
4633 } | |
4634 | |
4635 #endif /* __APPLE__ */ | |
4636 | |
4637 /* Primary internal function which creates utf8 encoded bytes objects. | |
4638 | |
4639 Allocation strategy: if the string is short, convert into a stack buffer | |
4640 and allocate exactly as much space needed at the end. Else allocate the | |
4641 maximum possible needed (4 result bytes per Unicode character), and return | |
4642 the excess memory at the end. | |
4643 */ | |
4644 PyObject * | |
4645 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) | |
4646 { | |
4647 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ | |
4648 | |
4649 Py_ssize_t i; /* index into s of next input byte */ | |
4650 PyObject *result; /* result string object */ | |
4651 char *p; /* next free byte in output buffer */ | |
4652 Py_ssize_t nallocated; /* number of result bytes allocated */ | |
4653 Py_ssize_t nneeded; /* number of result bytes needed */ | |
4654 char stackbuf[MAX_SHORT_UNICHARS * 4]; | |
4655 PyObject *errorHandler = NULL; | |
4656 PyObject *exc = NULL; | |
4657 int kind; | |
4658 void *data; | |
4659 Py_ssize_t size; | |
4660 PyObject *rep = NULL; | |
4661 | |
4662 if (!PyUnicode_Check(unicode)) { | |
4663 PyErr_BadArgument(); | |
4664 return NULL; | |
4665 } | |
4666 | |
4667 if (PyUnicode_READY(unicode) == -1) | |
4668 return NULL; | |
4669 | |
4670 if (PyUnicode_UTF8(unicode)) | |
4671 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), | |
4672 PyUnicode_UTF8_LENGTH(unicode)); | |
4673 | |
4674 kind = PyUnicode_KIND(unicode); | |
4675 data = PyUnicode_DATA(unicode); | |
4676 size = PyUnicode_GET_LENGTH(unicode); | |
4677 | |
4678 assert(size >= 0); | |
4679 | |
4680 if (size <= MAX_SHORT_UNICHARS) { | |
4681 /* Write into the stack buffer; nallocated can't overflow. | |
4682 * At the end, we'll allocate exactly as much heap space as it | |
4683 * turns out we need. | |
4684 */ | |
4685 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); | |
4686 result = NULL; /* will allocate after we're done */ | |
4687 p = stackbuf; | |
4688 } | |
4689 else { | |
4690 /* Overallocate on the heap, and give the excess back at the end. */ | |
4691 nallocated = size * 4; | |
4692 if (nallocated / 4 != size) /* overflow! */ | |
4693 return PyErr_NoMemory(); | |
4694 result = PyBytes_FromStringAndSize(NULL, nallocated); | |
4695 if (result == NULL) | |
4696 return NULL; | |
4697 p = PyBytes_AS_STRING(result); | |
4698 } | |
4699 | |
4700 for (i = 0; i < size;) { | |
4701 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); | |
4702 | |
4703 if (ch < 0x80) | |
4704 /* Encode ASCII */ | |
4705 *p++ = (char) ch; | |
4706 | |
4707 else if (ch < 0x0800) { | |
4708 /* Encode Latin-1 */ | |
4709 *p++ = (char)(0xc0 | (ch >> 6)); | |
4710 *p++ = (char)(0x80 | (ch & 0x3f)); | |
4711 } else if (Py_UNICODE_IS_SURROGATE(ch)) { | |
4712 Py_ssize_t newpos; | |
4713 Py_ssize_t repsize, k, startpos; | |
4714 startpos = i-1; | |
4715 rep = unicode_encode_call_errorhandler( | |
4716 errors, &errorHandler, "utf-8", "surrogates not allowed", | |
4717 unicode, &exc, startpos, startpos+1, &newpos); | |
4718 if (!rep) | |
4719 goto error; | |
4720 | |
4721 if (PyBytes_Check(rep)) | |
4722 repsize = PyBytes_GET_SIZE(rep); | |
4723 else | |
4724 repsize = PyUnicode_GET_LENGTH(rep); | |
4725 | |
4726 if (repsize > 4) { | |
4727 Py_ssize_t offset; | |
4728 | |
4729 if (result == NULL) | |
4730 offset = p - stackbuf; | |
4731 else | |
4732 offset = p - PyBytes_AS_STRING(result); | |
4733 | |
4734 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { | |
4735 /* integer overflow */ | |
4736 PyErr_NoMemory(); | |
4737 goto error; | |
4738 } | |
4739 nallocated += repsize - 4; | |
4740 if (result != NULL) { | |
4741 if (_PyBytes_Resize(&result, nallocated) < 0) | |
4742 goto error; | |
4743 } else { | |
4744 result = PyBytes_FromStringAndSize(NULL, nallocated); | |
4745 if (result == NULL) | |
4746 goto error; | |
4747 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); | |
4748 } | |
4749 p = PyBytes_AS_STRING(result) + offset; | |
4750 } | |
4751 | |
4752 if (PyBytes_Check(rep)) { | |
4753 char *prep = PyBytes_AS_STRING(rep); | |
4754 for(k = repsize; k > 0; k--) | |
4755 *p++ = *prep++; | |
4756 } else /* rep is unicode */ { | |
4757 enum PyUnicode_Kind repkind; | |
4758 void *repdata; | |
4759 | |
4760 if (PyUnicode_READY(rep) < 0) | |
4761 goto error; | |
4762 repkind = PyUnicode_KIND(rep); | |
4763 repdata = PyUnicode_DATA(rep); | |
4764 | |
4765 for(k=0; k<repsize; k++) { | |
4766 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k); | |
4767 if (0x80 <= c) { | |
4768 raise_encode_exception(&exc, "utf-8", | |
4769 unicode, | |
4770 i-1, i, | |
4771 "surrogates not allowed"); | |
4772 goto error; | |
4773 } | |
4774 *p++ = (char)c; | |
4775 } | |
4776 } | |
4777 Py_CLEAR(rep); | |
4778 } else if (ch < 0x10000) { | |
4779 *p++ = (char)(0xe0 | (ch >> 12)); | |
4780 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | |
4781 *p++ = (char)(0x80 | (ch & 0x3f)); | |
4782 } else /* ch >= 0x10000 */ { | |
4783 assert(ch <= MAX_UNICODE); | |
4784 /* Encode UCS4 Unicode ordinals */ | |
4785 *p++ = (char)(0xf0 | (ch >> 18)); | |
4786 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | |
4787 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | |
4788 *p++ = (char)(0x80 | (ch & 0x3f)); | |
4789 } | |
4790 } | |
4791 | |
4792 if (result == NULL) { | |
4793 /* This was stack allocated. */ | |
4794 nneeded = p - stackbuf; | |
4795 assert(nneeded <= nallocated); | |
4796 result = PyBytes_FromStringAndSize(stackbuf, nneeded); | |
4797 } | |
4798 else { | |
4799 /* Cut back to size actually needed. */ | |
4800 nneeded = p - PyBytes_AS_STRING(result); | |
4801 assert(nneeded <= nallocated); | |
4802 _PyBytes_Resize(&result, nneeded); | |
4803 } | |
4804 | |
4805 Py_XDECREF(errorHandler); | |
4806 Py_XDECREF(exc); | |
4807 return result; | |
4808 error: | |
4809 Py_XDECREF(rep); | |
4810 Py_XDECREF(errorHandler); | |
4811 Py_XDECREF(exc); | |
4812 Py_XDECREF(result); | |
4813 return NULL; | |
4814 | |
4815 #undef MAX_SHORT_UNICHARS | |
4816 } | |
4817 | |
4818 PyObject * | |
4819 PyUnicode_EncodeUTF8(const Py_UNICODE *s, | |
4820 Py_ssize_t size, | |
4821 const char *errors) | |
4822 { | |
4823 PyObject *v, *unicode; | |
4824 | |
4825 unicode = PyUnicode_FromUnicode(s, size); | |
4826 if (unicode == NULL) | |
4827 return NULL; | |
4828 v = _PyUnicode_AsUTF8String(unicode, errors); | |
4829 Py_DECREF(unicode); | |
4830 return v; | |
4831 } | |
4832 | |
4833 PyObject * | |
4834 PyUnicode_AsUTF8String(PyObject *unicode) | |
4835 { | |
4836 return _PyUnicode_AsUTF8String(unicode, NULL); | |
4837 } | |
4838 | |
4839 /* --- UTF-32 Codec ------------------------------------------------------- */ | |
4840 | |
4841 PyObject * | |
4842 PyUnicode_DecodeUTF32(const char *s, | |
4843 Py_ssize_t size, | |
4844 const char *errors, | |
4845 int *byteorder) | |
4846 { | |
4847 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); | |
4848 } | |
4849 | |
4850 PyObject * | |
4851 PyUnicode_DecodeUTF32Stateful(const char *s, | |
4852 Py_ssize_t size, | |
4853 const char *errors, | |
4854 int *byteorder, | |
4855 Py_ssize_t *consumed) | |
4856 { | |
4857 const char *starts = s; | |
4858 Py_ssize_t startinpos; | |
4859 Py_ssize_t endinpos; | |
4860 Py_ssize_t outpos; | |
4861 PyObject *unicode; | |
4862 const unsigned char *q, *e; | |
4863 int bo = 0; /* assume native ordering by default */ | |
4864 const char *errmsg = ""; | |
4865 /* Offsets from q for retrieving bytes in the right order. */ | |
4866 #ifdef BYTEORDER_IS_LITTLE_ENDIAN | |
4867 int iorder[] = {0, 1, 2, 3}; | |
4868 #else | |
4869 int iorder[] = {3, 2, 1, 0}; | |
4870 #endif | |
4871 PyObject *errorHandler = NULL; | |
4872 PyObject *exc = NULL; | |
4873 | |
4874 q = (unsigned char *)s; | |
4875 e = q + size; | |
4876 | |
4877 if (byteorder) | |
4878 bo = *byteorder; | |
4879 | |
4880 /* Check for BOM marks (U+FEFF) in the input and adjust current | |
4881 byte order setting accordingly. In native mode, the leading BOM | |
4882 mark is skipped, in all other modes, it is copied to the output | |
4883 stream as-is (giving a ZWNBSP character). */ | |
4884 if (bo == 0) { | |
4885 if (size >= 4) { | |
4886 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | | |
4887 (q[iorder[1]] << 8) | q[iorder[0]]; | |
4888 #ifdef BYTEORDER_IS_LITTLE_ENDIAN | |
4889 if (bom == 0x0000FEFF) { | |
4890 q += 4; | |
4891 bo = -1; | |
4892 } | |
4893 else if (bom == 0xFFFE0000) { | |
4894 q += 4; | |
4895 bo = 1; | |
4896 } | |
4897 #else | |
4898 if (bom == 0x0000FEFF) { | |
4899 q += 4; | |
4900 bo = 1; | |
4901 } | |
4902 else if (bom == 0xFFFE0000) { | |
4903 q += 4; | |
4904 bo = -1; | |
4905 } | |
4906 #endif | |
4907 } | |
4908 } | |
4909 | |
4910 if (bo == -1) { | |
4911 /* force LE */ | |
4912 iorder[0] = 0; | |
4913 iorder[1] = 1; | |
4914 iorder[2] = 2; | |
4915 iorder[3] = 3; | |
4916 } | |
4917 else if (bo == 1) { | |
4918 /* force BE */ | |
4919 iorder[0] = 3; | |
4920 iorder[1] = 2; | |
4921 iorder[2] = 1; | |
4922 iorder[3] = 0; | |
4923 } | |
4924 | |
4925 /* This might be one to much, because of a BOM */ | |
4926 unicode = PyUnicode_New((size+3)/4, 127); | |
4927 if (!unicode) | |
4928 return NULL; | |
4929 if (size == 0) | |
4930 return unicode; | |
4931 outpos = 0; | |
4932 | |
4933 while (q < e) { | |
4934 Py_UCS4 ch; | |
4935 /* remaining bytes at the end? (size should be divisible by 4) */ | |
4936 if (e-q<4) { | |
4937 if (consumed) | |
4938 break; | |
4939 errmsg = "truncated data"; | |
4940 startinpos = ((const char *)q)-starts; | |
4941 endinpos = ((const char *)e)-starts; | |
4942 goto utf32Error; | |
4943 /* The remaining input chars are ignored if the callback | |
4944 chooses to skip the input */ | |
4945 } | |
4946 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | | |
4947 (q[iorder[1]] << 8) | q[iorder[0]]; | |
4948 | |
4949 if (ch >= 0x110000) | |
4950 { | |
4951 errmsg = "codepoint not in range(0x110000)"; | |
4952 startinpos = ((const char *)q)-starts; | |
4953 endinpos = startinpos+4; | |
4954 goto utf32Error; | |
4955 } | |
4956 if (unicode_putchar(&unicode, &outpos, ch) < 0) | |
4957 goto onError; | |
4958 q += 4; | |
4959 continue; | |
4960 utf32Error: | |
4961 if (unicode_decode_call_errorhandler( | |
4962 errors, &errorHandler, | |
4963 "utf32", errmsg, | |
4964 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, | |
4965 &unicode, &outpos)) | |
4966 goto onError; | |
4967 } | |
4968 | |
4969 if (byteorder) | |
4970 *byteorder = bo; | |
4971 | |
4972 if (consumed) | |
4973 *consumed = (const char *)q-starts; | |
4974 | |
4975 /* Adjust length */ | |
4976 if (PyUnicode_Resize(&unicode, outpos) < 0) | |
4977 goto onError; | |
4978 | |
4979 Py_XDECREF(errorHandler); | |
4980 Py_XDECREF(exc); | |
4981 return unicode_result(unicode); | |
4982 | |
4983 onError: | |
4984 Py_DECREF(unicode); | |
4985 Py_XDECREF(errorHandler); | |
4986 Py_XDECREF(exc); | |
4987 return NULL; | |
4988 } | |
4989 | |
4990 PyObject * | |
4991 _PyUnicode_EncodeUTF32(PyObject *str, | |
4992 const char *errors, | |
4993 int byteorder) | |
4994 { | |
4995 int kind; | |
4996 void *data; | |
4997 Py_ssize_t len; | |
4998 PyObject *v; | |
4999 unsigned char *p; | |
5000 Py_ssize_t nsize, bytesize, i; | |
5001 /* Offsets from p for storing byte pairs in the right order. */ | |
5002 #ifdef BYTEORDER_IS_LITTLE_ENDIAN | |
5003 int iorder[] = {0, 1, 2, 3}; | |
5004 #else | |
5005 int iorder[] = {3, 2, 1, 0}; | |
5006 #endif | |
5007 | |
5008 #define STORECHAR(CH) \ | |
5009 do { \ | |
5010 p[iorder[3]] = ((CH) >> 24) & 0xff; \ | |
5011 p[iorder[2]] = ((CH) >> 16) & 0xff; \ | |
5012 p[iorder[1]] = ((CH) >> 8) & 0xff; \ | |
5013 p[iorder[0]] = (CH) & 0xff; \ | |
5014 p += 4; \ | |
5015 } while(0) | |
5016 | |
5017 if (!PyUnicode_Check(str)) { | |
5018 PyErr_BadArgument(); | |
5019 return NULL; | |
5020 } | |
5021 if (PyUnicode_READY(str) < 0) | |
5022 return NULL; | |
5023 kind = PyUnicode_KIND(str); | |
5024 data = PyUnicode_DATA(str); | |
5025 len = PyUnicode_GET_LENGTH(str); | |
5026 | |
5027 nsize = len + (byteorder == 0); | |
5028 bytesize = nsize * 4; | |
5029 if (bytesize / 4 != nsize) | |
5030 return PyErr_NoMemory(); | |
5031 v = PyBytes_FromStringAndSize(NULL, bytesize); | |
5032 if (v == NULL) | |
5033 return NULL; | |
5034 | |
5035 p = (unsigned char *)PyBytes_AS_STRING(v); | |
5036 if (byteorder == 0) | |
5037 STORECHAR(0xFEFF); | |
5038 if (len == 0) | |
5039 goto done; | |
5040 | |
5041 if (byteorder == -1) { | |
5042 /* force LE */ | |
5043 iorder[0] = 0; | |
5044 iorder[1] = 1; | |
5045 iorder[2] = 2; | |
5046 iorder[3] = 3; | |
5047 } | |
5048 else if (byteorder == 1) { | |
5049 /* force BE */ | |
5050 iorder[0] = 3; | |
5051 iorder[1] = 2; | |
5052 iorder[2] = 1; | |
5053 iorder[3] = 0; | |
5054 } | |
5055 | |
5056 for (i = 0; i < len; i++) | |
5057 STORECHAR(PyUnicode_READ(kind, data, i)); | |
5058 | |
5059 done: | |
5060 return v; | |
5061 #undef STORECHAR | |
5062 } | |
5063 | |
5064 PyObject * | |
5065 PyUnicode_EncodeUTF32(const Py_UNICODE *s, | |
5066 Py_ssize_t size, | |
5067 const char *errors, | |
5068 int byteorder) | |
5069 { | |
5070 PyObject *result; | |
5071 PyObject *tmp = PyUnicode_FromUnicode(s, size); | |
5072 if (tmp == NULL) | |
5073 return NULL; | |
5074 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); | |
5075 Py_DECREF(tmp); | |
5076 return result; | |
5077 } | |
5078 | |
5079 PyObject * | |
5080 PyUnicode_AsUTF32String(PyObject *unicode) | |
5081 { | |
5082 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); | |
5083 } | |
5084 | |
5085 /* --- UTF-16 Codec ------------------------------------------------------- */ | |
5086 | |
5087 PyObject * | |
5088 PyUnicode_DecodeUTF16(const char *s, | |
5089 Py_ssize_t size, | |
5090 const char *errors, | |
5091 int *byteorder) | |
5092 { | |
5093 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); | |
5094 } | |
5095 | |
5096 /* Two masks for fast checking of whether a C 'long' may contain | |
5097 UTF16-encoded surrogate characters. This is an efficient heuristic, | |
5098 assuming that non-surrogate characters with a code point >= 0x8000 are | |
5099 rare in most input. | |
5100 FAST_CHAR_MASK is used when the input is in native byte ordering, | |
5101 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. | |
5102 */ | |
5103 #if (SIZEOF_LONG == 8) | |
5104 # define FAST_CHAR_MASK 0x8000800080008000L | |
5105 # define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L | |
5106 #elif (SIZEOF_LONG == 4) | |
5107 # define FAST_CHAR_MASK 0x80008000L | |
5108 # define SWAPPED_FAST_CHAR_MASK 0x00800080L | |
5109 #else | |
5110 # error C 'long' size should be either 4 or 8! | |
5111 #endif | |
5112 | |
5113 PyObject * | |
5114 PyUnicode_DecodeUTF16Stateful(const char *s, | |
5115 Py_ssize_t size, | |
5116 const char *errors, | |
5117 int *byteorder, | |
5118 Py_ssize_t *consumed) | |
5119 { | |
5120 const char *starts = s; | |
5121 Py_ssize_t startinpos; | |
5122 Py_ssize_t endinpos; | |
5123 Py_ssize_t outpos; | |
5124 PyObject *unicode; | |
5125 const unsigned char *q, *e, *aligned_end; | |
5126 int bo = 0; /* assume native ordering by default */ | |
5127 int native_ordering = 0; | |
5128 const char *errmsg = ""; | |
5129 /* Offsets from q for retrieving byte pairs in the right order. */ | |
5130 #ifdef BYTEORDER_IS_LITTLE_ENDIAN | |
5131 int ihi = 1, ilo = 0; | |
5132 #else | |
5133 int ihi = 0, ilo = 1; | |
5134 #endif | |
5135 PyObject *errorHandler = NULL; | |
5136 PyObject *exc = NULL; | |
5137 | |
5138 /* Note: size will always be longer than the resulting Unicode | |
5139 character count */ | |
5140 unicode = PyUnicode_New(size, 127); | |
5141 if (!unicode) | |
5142 return NULL; | |
5143 if (size == 0) | |
5144 return unicode; | |
5145 outpos = 0; | |
5146 | |
5147 q = (unsigned char *)s; | |
5148 e = q + size - 1; | |
5149 | |
5150 if (byteorder) | |
5151 bo = *byteorder; | |
5152 | |
5153 /* Check for BOM marks (U+FEFF) in the input and adjust current | |
5154 byte order setting accordingly. In native mode, the leading BOM | |
5155 mark is skipped, in all other modes, it is copied to the output | |
5156 stream as-is (giving a ZWNBSP character). */ | |
5157 if (bo == 0) { | |
5158 if (size >= 2) { | |
5159 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo]; | |
5160 #ifdef BYTEORDER_IS_LITTLE_ENDIAN | |
5161 if (bom == 0xFEFF) { | |
5162 q += 2; | |
5163 bo = -1; | |
5164 } | |
5165 else if (bom == 0xFFFE) { | |
5166 q += 2; | |
5167 bo = 1; | |
5168 } | |
5169 #else | |
5170 if (bom == 0xFEFF) { | |
5171 q += 2; | |
5172 bo = 1; | |
5173 } | |
5174 else if (bom == 0xFFFE) { | |
5175 q += 2; | |
5176 bo = -1; | |
5177 } | |
5178 #endif | |
5179 } | |
5180 } | |
5181 | |
5182 if (bo == -1) { | |
5183 /* force LE */ | |
5184 ihi = 1; | |
5185 ilo = 0; | |
5186 } | |
5187 else if (bo == 1) { | |
5188 /* force BE */ | |
5189 ihi = 0; | |
5190 ilo = 1; | |
5191 } | |
5192 #ifdef BYTEORDER_IS_LITTLE_ENDIAN | |
5193 native_ordering = ilo < ihi; | |
5194 #else | |
5195 native_ordering = ilo > ihi; | |
5196 #endif | |
5197 | |
5198 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); | |
5199 while (q < e) { | |
5200 Py_UCS4 ch; | |
5201 /* First check for possible aligned read of a C 'long'. Unaligned | |
5202 reads are more expensive, better to defer to another iteration. */ | |
5203 if (!((size_t) q & LONG_PTR_MASK)) { | |
5204 /* Fast path for runs of non-surrogate chars. */ | |
5205 register const unsigned char *_q = q; | |
5206 int kind = PyUnicode_KIND(unicode); | |
5207 void *data = PyUnicode_DATA(unicode); | |
5208 while (_q < aligned_end) { | |
5209 unsigned long block = * (unsigned long *) _q; | |
5210 unsigned short *pblock = (unsigned short*)█ | |
5211 Py_UCS4 maxch; | |
5212 if (native_ordering) { | |
5213 /* Can use buffer directly */ | |
5214 if (block & FAST_CHAR_MASK) | |
5215 break; | |
5216 } | |
5217 else { | |
5218 /* Need to byte-swap */ | |
5219 unsigned char *_p = (unsigned char*)pblock; | |
5220 if (block & SWAPPED_FAST_CHAR_MASK) | |
5221 break; | |
5222 _p[0] = _q[1]; | |
5223 _p[1] = _q[0]; | |
5224 _p[2] = _q[3]; | |
5225 _p[3] = _q[2]; | |
5226 #if (SIZEOF_LONG == 8) | |
5227 _p[4] = _q[5]; | |
5228 _p[5] = _q[4]; | |
5229 _p[6] = _q[7]; | |
5230 _p[7] = _q[6]; | |
5231 #endif | |
5232 } | |
5233 maxch = Py_MAX(pblock[0], pblock[1]); | |
5234 #if SIZEOF_LONG == 8 | |
5235 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3])); | |
5236 #endif | |
5237 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) { | |
5238 if (unicode_widen(&unicode, maxch) < 0) | |
5239 goto onError; | |
5240 kind = PyUnicode_KIND(unicode); | |
5241 data = PyUnicode_DATA(unicode); | |
5242 } | |
5243 PyUnicode_WRITE(kind, data, outpos++, pblock[0]); | |
5244 PyUnicode_WRITE(kind, data, outpos++, pblock[1]); | |
5245 #if SIZEOF_LONG == 8 | |
5246 PyUnicode_WRITE(kind, data, outpos++, pblock[2]); | |
5247 PyUnicode_WRITE(kind, data, outpos++, pblock[3]); | |
5248 #endif | |
5249 _q += SIZEOF_LONG; | |
5250 } | |
5251 q = _q; | |
5252 if (q >= e) | |
5253 break; | |
5254 } | |
5255 ch = (q[ihi] << 8) | q[ilo]; | |
5256 | |
5257 q += 2; | |
5258 | |
5259 if (!Py_UNICODE_IS_SURROGATE(ch)) { | |
5260 if (unicode_putchar(&unicode, &outpos, ch) < 0) | |
5261 goto onError; | |
5262 continue; | |
5263 } | |
5264 | |
5265 /* UTF-16 code pair: */ | |
5266 if (q > e) { | |
5267 errmsg = "unexpected end of data"; | |
5268 startinpos = (((const char *)q) - 2) - starts; | |
5269 endinpos = ((const char *)e) + 1 - starts; | |
5270 goto utf16Error; | |
5271 } | |
5272 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) { | |
5273 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo]; | |
5274 q += 2; | |
5275 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) { | |
5276 if (unicode_putchar(&unicode, &outpos, | |
5277 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0) | |
5278 goto onError; | |
5279 continue; | |
5280 } | |
5281 else { | |
5282 errmsg = "illegal UTF-16 surrogate"; | |
5283 startinpos = (((const char *)q)-4)-starts; | |
5284 endinpos = startinpos+2; | |
5285 goto utf16Error; | |
5286 } | |
5287 | |
5288 } | |
5289 errmsg = "illegal encoding"; | |
5290 startinpos = (((const char *)q)-2)-starts; | |
5291 endinpos = startinpos+2; | |
5292 /* Fall through to report the error */ | |
5293 | |
5294 utf16Error: | |
5295 if (unicode_decode_call_errorhandler( | |
5296 errors, | |
5297 &errorHandler, | |
5298 "utf16", errmsg, | |
5299 &starts, | |
5300 (const char **)&e, | |
5301 &startinpos, | |
5302 &endinpos, | |
5303 &exc, | |
5304 (const char **)&q, | |
5305 &unicode, | |
5306 &outpos)) | |
5307 goto onError; | |
5308 } | |
5309 /* remaining byte at the end? (size should be even) */ | |
5310 if (e == q) { | |
5311 if (!consumed) { | |
5312 errmsg = "truncated data"; | |
5313 startinpos = ((const char *)q) - starts; | |
5314 endinpos = ((const char *)e) + 1 - starts; | |
5315 if (unicode_decode_call_errorhandler( | |
5316 errors, | |
5317 &errorHandler, | |
5318 "utf16", errmsg, | |
5319 &starts, | |
5320 (const char **)&e, | |
5321 &startinpos, | |
5322 &endinpos, | |
5323 &exc, | |
5324 (const char **)&q, | |
5325 &unicode, | |
5326 &outpos)) | |
5327 goto onError; | |
5328 /* The remaining input chars are ignored if the callback | |
5329 chooses to skip the input */ | |
5330 } | |
5331 } | |
5332 | |
5333 if (byteorder) | |
5334 *byteorder = bo; | |
5335 | |
5336 if (consumed) | |
5337 *consumed = (const char *)q-starts; | |
5338 | |
5339 /* Adjust length */ | |
5340 if (PyUnicode_Resize(&unicode, outpos) < 0) | |
5341 goto onError; | |
5342 | |
5343 Py_XDECREF(errorHandler); | |
5344 Py_XDECREF(exc); | |
5345 return unicode_result(unicode); | |
5346 | |
5347 onError: | |
5348 Py_DECREF(unicode); | |
5349 Py_XDECREF(errorHandler); | |
5350 Py_XDECREF(exc); | |
5351 return NULL; | |
5352 } | |
5353 | |
5354 #undef FAST_CHAR_MASK | |
5355 #undef SWAPPED_FAST_CHAR_MASK | |
5356 | |
5357 PyObject * | |
5358 _PyUnicode_EncodeUTF16(PyObject *str, | |
5359 const char *errors, | |
5360 int byteorder) | |
5361 { | |
5362 int kind; | |
5363 void *data; | |
5364 Py_ssize_t len; | |
5365 PyObject *v; | |
5366 unsigned char *p; | |
5367 Py_ssize_t nsize, bytesize; | |
5368 Py_ssize_t i, pairs; | |
5369 /* Offsets from p for storing byte pairs in the right order. */ | |
5370 #ifdef BYTEORDER_IS_LITTLE_ENDIAN | |
5371 int ihi = 1, ilo = 0; | |
5372 #else | |
5373 int ihi = 0, ilo = 1; | |
5374 #endif | |
5375 | |
5376 #define STORECHAR(CH) \ | |
5377 do { \ | |
5378 p[ihi] = ((CH) >> 8) & 0xff; \ | |
5379 p[ilo] = (CH) & 0xff; \ | |
5380 p += 2; \ | |
5381 } while(0) | |
5382 | |
5383 if (!PyUnicode_Check(str)) { | |
5384 PyErr_BadArgument(); | |
5385 return NULL; | |
5386 } | |
5387 if (PyUnicode_READY(str) < 0) | |
5388 return NULL; | |
5389 kind = PyUnicode_KIND(str); | |
5390 data = PyUnicode_DATA(str); | |
5391 len = PyUnicode_GET_LENGTH(str); | |
5392 | |
5393 pairs = 0; | |
5394 if (kind == PyUnicode_4BYTE_KIND) | |
5395 for (i = 0; i < len; i++) | |
5396 if (PyUnicode_READ(kind, data, i) >= 0x10000) | |
5397 pairs++; | |
5398 /* 2 * (len + pairs + (byteorder == 0)) */ | |
5399 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) | |
5400 return PyErr_NoMemory(); | |
5401 nsize = len + pairs + (byteorder == 0); | |
5402 bytesize = nsize * 2; | |
5403 if (bytesize / 2 != nsize) | |
5404 return PyErr_NoMemory(); | |
5405 v = PyBytes_FromStringAndSize(NULL, bytesize); | |
5406 if (v == NULL) | |
5407 return NULL; | |
5408 | |
5409 p = (unsigned char *)PyBytes_AS_STRING(v); | |
5410 if (byteorder == 0) | |
5411 STORECHAR(0xFEFF); | |
5412 if (len == 0) | |
5413 goto done; | |
5414 | |
5415 if (byteorder == -1) { | |
5416 /* force LE */ | |
5417 ihi = 1; | |
5418 ilo = 0; | |
5419 } | |
5420 else if (byteorder == 1) { | |
5421 /* force BE */ | |
5422 ihi = 0; | |
5423 ilo = 1; | |
5424 } | |
5425 | |
5426 for (i = 0; i < len; i++) { | |
5427 Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
5428 Py_UCS4 ch2 = 0; | |
5429 if (ch >= 0x10000) { | |
5430 ch2 = Py_UNICODE_LOW_SURROGATE(ch); | |
5431 ch = Py_UNICODE_HIGH_SURROGATE(ch); | |
5432 } | |
5433 STORECHAR(ch); | |
5434 if (ch2) | |
5435 STORECHAR(ch2); | |
5436 } | |
5437 | |
5438 done: | |
5439 return v; | |
5440 #undef STORECHAR | |
5441 } | |
5442 | |
5443 PyObject * | |
5444 PyUnicode_EncodeUTF16(const Py_UNICODE *s, | |
5445 Py_ssize_t size, | |
5446 const char *errors, | |
5447 int byteorder) | |
5448 { | |
5449 PyObject *result; | |
5450 PyObject *tmp = PyUnicode_FromUnicode(s, size); | |
5451 if (tmp == NULL) | |
5452 return NULL; | |
5453 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); | |
5454 Py_DECREF(tmp); | |
5455 return result; | |
5456 } | |
5457 | |
5458 PyObject * | |
5459 PyUnicode_AsUTF16String(PyObject *unicode) | |
5460 { | |
5461 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); | |
5462 } | |
5463 | |
5464 /* --- Unicode Escape Codec ----------------------------------------------- */ | |
5465 | |
5466 /* Helper function for PyUnicode_DecodeUnicodeEscape, determines | |
5467 if all the escapes in the string make it still a valid ASCII string. | |
5468 Returns -1 if any escapes were found which cause the string to | |
5469 pop out of ASCII range. Otherwise returns the length of the | |
5470 required buffer to hold the string. | |
5471 */ | |
5472 static Py_ssize_t | |
5473 length_of_escaped_ascii_string(const char *s, Py_ssize_t size) | |
5474 { | |
5475 const unsigned char *p = (const unsigned char *)s; | |
5476 const unsigned char *end = p + size; | |
5477 Py_ssize_t length = 0; | |
5478 | |
5479 if (size < 0) | |
5480 return -1; | |
5481 | |
5482 for (; p < end; ++p) { | |
5483 if (*p > 127) { | |
5484 /* Non-ASCII */ | |
5485 return -1; | |
5486 } | |
5487 else if (*p != '\\') { | |
5488 /* Normal character */ | |
5489 ++length; | |
5490 } | |
5491 else { | |
5492 /* Backslash-escape, check next char */ | |
5493 ++p; | |
5494 /* Escape sequence reaches till end of string or | |
5495 non-ASCII follow-up. */ | |
5496 if (p >= end || *p > 127) | |
5497 return -1; | |
5498 switch (*p) { | |
5499 case '\n': | |
5500 /* backslash + \n result in zero characters */ | |
5501 break; | |
5502 case '\\': case '\'': case '\"': | |
5503 case 'b': case 'f': case 't': | |
5504 case 'n': case 'r': case 'v': case 'a': | |
5505 ++length; | |
5506 break; | |
5507 case '0': case '1': case '2': case '3': | |
5508 case '4': case '5': case '6': case '7': | |
5509 case 'x': case 'u': case 'U': case 'N': | |
5510 /* these do not guarantee ASCII characters */ | |
5511 return -1; | |
5512 default: | |
5513 /* count the backslash + the other character */ | |
5514 length += 2; | |
5515 } | |
5516 } | |
5517 } | |
5518 return length; | |
5519 } | |
5520 | |
5521 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; | |
5522 | |
5523 PyObject * | |
5524 PyUnicode_DecodeUnicodeEscape(const char *s, | |
5525 Py_ssize_t size, | |
5526 const char *errors) | |
5527 { | |
5528 const char *starts = s; | |
5529 Py_ssize_t startinpos; | |
5530 Py_ssize_t endinpos; | |
5531 int j; | |
5532 PyObject *v; | |
5533 const char *end; | |
5534 char* message; | |
5535 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ | |
5536 PyObject *errorHandler = NULL; | |
5537 PyObject *exc = NULL; | |
5538 Py_ssize_t len; | |
5539 Py_ssize_t i; | |
5540 | |
5541 len = length_of_escaped_ascii_string(s, size); | |
5542 | |
5543 /* After length_of_escaped_ascii_string() there are two alternatives, | |
5544 either the string is pure ASCII with named escapes like \n, etc. | |
5545 and we determined it's exact size (common case) | |
5546 or it contains \x, \u, ... escape sequences. then we create a | |
5547 legacy wchar string and resize it at the end of this function. */ | |
5548 if (len >= 0) { | |
5549 v = PyUnicode_New(len, 127); | |
5550 if (!v) | |
5551 goto onError; | |
5552 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); | |
5553 } | |
5554 else { | |
5555 /* Escaped strings will always be longer than the resulting | |
5556 Unicode string, so we start with size here and then reduce the | |
5557 length after conversion to the true value. | |
5558 (but if the error callback returns a long replacement string | |
5559 we'll have to allocate more space) */ | |
5560 v = PyUnicode_New(size, 127); | |
5561 if (!v) | |
5562 goto onError; | |
5563 len = size; | |
5564 } | |
5565 | |
5566 if (size == 0) | |
5567 return v; | |
5568 i = 0; | |
5569 end = s + size; | |
5570 | |
5571 while (s < end) { | |
5572 unsigned char c; | |
5573 Py_UCS4 x; | |
5574 int digits; | |
5575 | |
5576 /* The only case in which i == ascii_length is a backslash | |
5577 followed by a newline. */ | |
5578 assert(i <= len); | |
5579 | |
5580 /* Non-escape characters are interpreted as Unicode ordinals */ | |
5581 if (*s != '\\') { | |
5582 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0) | |
5583 goto onError; | |
5584 continue; | |
5585 } | |
5586 | |
5587 startinpos = s-starts; | |
5588 /* \ - Escapes */ | |
5589 s++; | |
5590 c = *s++; | |
5591 if (s > end) | |
5592 c = '\0'; /* Invalid after \ */ | |
5593 | |
5594 /* The only case in which i == ascii_length is a backslash | |
5595 followed by a newline. */ | |
5596 assert(i < len || (i == len && c == '\n')); | |
5597 | |
5598 switch (c) { | |
5599 | |
5600 /* \x escapes */ | |
5601 #define WRITECHAR(ch) \ | |
5602 do { \ | |
5603 if (unicode_putchar(&v, &i, ch) < 0) \ | |
5604 goto onError; \ | |
5605 }while(0) | |
5606 | |
5607 case '\n': break; | |
5608 case '\\': WRITECHAR('\\'); break; | |
5609 case '\'': WRITECHAR('\''); break; | |
5610 case '\"': WRITECHAR('\"'); break; | |
5611 case 'b': WRITECHAR('\b'); break; | |
5612 /* FF */ | |
5613 case 'f': WRITECHAR('\014'); break; | |
5614 case 't': WRITECHAR('\t'); break; | |
5615 case 'n': WRITECHAR('\n'); break; | |
5616 case 'r': WRITECHAR('\r'); break; | |
5617 /* VT */ | |
5618 case 'v': WRITECHAR('\013'); break; | |
5619 /* BEL, not classic C */ | |
5620 case 'a': WRITECHAR('\007'); break; | |
5621 | |
5622 /* \OOO (octal) escapes */ | |
5623 case '0': case '1': case '2': case '3': | |
5624 case '4': case '5': case '6': case '7': | |
5625 x = s[-1] - '0'; | |
5626 if (s < end && '0' <= *s && *s <= '7') { | |
5627 x = (x<<3) + *s++ - '0'; | |
5628 if (s < end && '0' <= *s && *s <= '7') | |
5629 x = (x<<3) + *s++ - '0'; | |
5630 } | |
5631 WRITECHAR(x); | |
5632 break; | |
5633 | |
5634 /* hex escapes */ | |
5635 /* \xXX */ | |
5636 case 'x': | |
5637 digits = 2; | |
5638 message = "truncated \\xXX escape"; | |
5639 goto hexescape; | |
5640 | |
5641 /* \uXXXX */ | |
5642 case 'u': | |
5643 digits = 4; | |
5644 message = "truncated \\uXXXX escape"; | |
5645 goto hexescape; | |
5646 | |
5647 /* \UXXXXXXXX */ | |
5648 case 'U': | |
5649 digits = 8; | |
5650 message = "truncated \\UXXXXXXXX escape"; | |
5651 hexescape: | |
5652 chr = 0; | |
5653 if (s+digits>end) { | |
5654 endinpos = size; | |
5655 if (unicode_decode_call_errorhandler( | |
5656 errors, &errorHandler, | |
5657 "unicodeescape", "end of string in escape sequence", | |
5658 &starts, &end, &startinpos, &endinpos, &exc, &s, | |
5659 &v, &i)) | |
5660 goto onError; | |
5661 goto nextByte; | |
5662 } | |
5663 for (j = 0; j < digits; ++j) { | |
5664 c = (unsigned char) s[j]; | |
5665 if (!Py_ISXDIGIT(c)) { | |
5666 endinpos = (s+j+1)-starts; | |
5667 if (unicode_decode_call_errorhandler( | |
5668 errors, &errorHandler, | |
5669 "unicodeescape", message, | |
5670 &starts, &end, &startinpos, &endinpos, &exc, &s, | |
5671 &v, &i)) | |
5672 goto onError; | |
5673 len = PyUnicode_GET_LENGTH(v); | |
5674 goto nextByte; | |
5675 } | |
5676 chr = (chr<<4) & ~0xF; | |
5677 if (c >= '0' && c <= '9') | |
5678 chr += c - '0'; | |
5679 else if (c >= 'a' && c <= 'f') | |
5680 chr += 10 + c - 'a'; | |
5681 else | |
5682 chr += 10 + c - 'A'; | |
5683 } | |
5684 s += j; | |
5685 if (chr == 0xffffffff && PyErr_Occurred()) | |
5686 /* _decoding_error will have already written into the | |
5687 target buffer. */ | |
5688 break; | |
5689 store: | |
5690 /* when we get here, chr is a 32-bit unicode character */ | |
5691 if (chr <= MAX_UNICODE) { | |
5692 WRITECHAR(chr); | |
5693 } else { | |
5694 endinpos = s-starts; | |
5695 if (unicode_decode_call_errorhandler( | |
5696 errors, &errorHandler, | |
5697 "unicodeescape", "illegal Unicode character", | |
5698 &starts, &end, &startinpos, &endinpos, &exc, &s, | |
5699 &v, &i)) | |
5700 goto onError; | |
5701 } | |
5702 break; | |
5703 | |
5704 /* \N{name} */ | |
5705 case 'N': | |
5706 message = "malformed \\N character escape"; | |
5707 if (ucnhash_CAPI == NULL) { | |
5708 /* load the unicode data module */ | |
5709 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( | |
5710 PyUnicodeData_CAPSULE_NAME, 1); | |
5711 if (ucnhash_CAPI == NULL) | |
5712 goto ucnhashError; | |
5713 } | |
5714 if (*s == '{') { | |
5715 const char *start = s+1; | |
5716 /* look for the closing brace */ | |
5717 while (*s != '}' && s < end) | |
5718 s++; | |
5719 if (s > start && s < end && *s == '}') { | |
5720 /* found a name. look it up in the unicode database */ | |
5721 message = "unknown Unicode character name"; | |
5722 s++; | |
5723 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), | |
5724 &chr, 0)) | |
5725 goto store; | |
5726 } | |
5727 } | |
5728 endinpos = s-starts; | |
5729 if (unicode_decode_call_errorhandler( | |
5730 errors, &errorHandler, | |
5731 "unicodeescape", message, | |
5732 &starts, &end, &startinpos, &endinpos, &exc, &s, | |
5733 &v, &i)) | |
5734 goto onError; | |
5735 break; | |
5736 | |
5737 default: | |
5738 if (s > end) { | |
5739 message = "\\ at end of string"; | |
5740 s--; | |
5741 endinpos = s-starts; | |
5742 if (unicode_decode_call_errorhandler( | |
5743 errors, &errorHandler, | |
5744 "unicodeescape", message, | |
5745 &starts, &end, &startinpos, &endinpos, &exc, &s, | |
5746 &v, &i)) | |
5747 goto onError; | |
5748 } | |
5749 else { | |
5750 WRITECHAR('\\'); | |
5751 WRITECHAR(s[-1]); | |
5752 } | |
5753 break; | |
5754 } | |
5755 nextByte: | |
5756 ; | |
5757 } | |
5758 #undef WRITECHAR | |
5759 | |
5760 if (PyUnicode_Resize(&v, i) < 0) | |
5761 goto onError; | |
5762 Py_XDECREF(errorHandler); | |
5763 Py_XDECREF(exc); | |
5764 return unicode_result(v); | |
5765 | |
5766 ucnhashError: | |
5767 PyErr_SetString( | |
5768 PyExc_UnicodeError, | |
5769 "\\N escapes not supported (can't load unicodedata module)" | |
5770 ); | |
5771 Py_XDECREF(v); | |
5772 Py_XDECREF(errorHandler); | |
5773 Py_XDECREF(exc); | |
5774 return NULL; | |
5775 | |
5776 onError: | |
5777 Py_XDECREF(v); | |
5778 Py_XDECREF(errorHandler); | |
5779 Py_XDECREF(exc); | |
5780 return NULL; | |
5781 } | |
5782 | |
5783 /* Return a Unicode-Escape string version of the Unicode object. | |
5784 | |
5785 If quotes is true, the string is enclosed in u"" or u'' quotes as | |
5786 appropriate. | |
5787 | |
5788 */ | |
5789 | |
5790 PyObject * | |
5791 PyUnicode_AsUnicodeEscapeString(PyObject *unicode) | |
5792 { | |
5793 Py_ssize_t i, len; | |
5794 PyObject *repr; | |
5795 char *p; | |
5796 int kind; | |
5797 void *data; | |
5798 Py_ssize_t expandsize = 0; | |
5799 | |
5800 /* Initial allocation is based on the longest-possible unichr | |
5801 escape. | |
5802 | |
5803 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source | |
5804 unichr, so in this case it's the longest unichr escape. In | |
5805 narrow (UTF-16) builds this is five chars per source unichr | |
5806 since there are two unichrs in the surrogate pair, so in narrow | |
5807 (UTF-16) builds it's not the longest unichr escape. | |
5808 | |
5809 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, | |
5810 so in the narrow (UTF-16) build case it's the longest unichr | |
5811 escape. | |
5812 */ | |
5813 | |
5814 if (!PyUnicode_Check(unicode)) { | |
5815 PyErr_BadArgument(); | |
5816 return NULL; | |
5817 } | |
5818 if (PyUnicode_READY(unicode) < 0) | |
5819 return NULL; | |
5820 len = PyUnicode_GET_LENGTH(unicode); | |
5821 kind = PyUnicode_KIND(unicode); | |
5822 data = PyUnicode_DATA(unicode); | |
5823 switch(kind) { | |
5824 case PyUnicode_1BYTE_KIND: expandsize = 4; break; | |
5825 case PyUnicode_2BYTE_KIND: expandsize = 6; break; | |
5826 case PyUnicode_4BYTE_KIND: expandsize = 10; break; | |
5827 } | |
5828 | |
5829 if (len == 0) | |
5830 return PyBytes_FromStringAndSize(NULL, 0); | |
5831 | |
5832 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) | |
5833 return PyErr_NoMemory(); | |
5834 | |
5835 repr = PyBytes_FromStringAndSize(NULL, | |
5836 2 | |
5837 + expandsize*len | |
5838 + 1); | |
5839 if (repr == NULL) | |
5840 return NULL; | |
5841 | |
5842 p = PyBytes_AS_STRING(repr); | |
5843 | |
5844 for (i = 0; i < len; i++) { | |
5845 Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
5846 | |
5847 /* Escape backslashes */ | |
5848 if (ch == '\\') { | |
5849 *p++ = '\\'; | |
5850 *p++ = (char) ch; | |
5851 continue; | |
5852 } | |
5853 | |
5854 /* Map 21-bit characters to '\U00xxxxxx' */ | |
5855 else if (ch >= 0x10000) { | |
5856 assert(ch <= MAX_UNICODE); | |
5857 *p++ = '\\'; | |
5858 *p++ = 'U'; | |
5859 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; | |
5860 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; | |
5861 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; | |
5862 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; | |
5863 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; | |
5864 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; | |
5865 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; | |
5866 *p++ = Py_hexdigits[ch & 0x0000000F]; | |
5867 continue; | |
5868 } | |
5869 | |
5870 /* Map 16-bit characters to '\uxxxx' */ | |
5871 if (ch >= 256) { | |
5872 *p++ = '\\'; | |
5873 *p++ = 'u'; | |
5874 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; | |
5875 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; | |
5876 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; | |
5877 *p++ = Py_hexdigits[ch & 0x000F]; | |
5878 } | |
5879 | |
5880 /* Map special whitespace to '\t', \n', '\r' */ | |
5881 else if (ch == '\t') { | |
5882 *p++ = '\\'; | |
5883 *p++ = 't'; | |
5884 } | |
5885 else if (ch == '\n') { | |
5886 *p++ = '\\'; | |
5887 *p++ = 'n'; | |
5888 } | |
5889 else if (ch == '\r') { | |
5890 *p++ = '\\'; | |
5891 *p++ = 'r'; | |
5892 } | |
5893 | |
5894 /* Map non-printable US ASCII to '\xhh' */ | |
5895 else if (ch < ' ' || ch >= 0x7F) { | |
5896 *p++ = '\\'; | |
5897 *p++ = 'x'; | |
5898 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; | |
5899 *p++ = Py_hexdigits[ch & 0x000F]; | |
5900 } | |
5901 | |
5902 /* Copy everything else as-is */ | |
5903 else | |
5904 *p++ = (char) ch; | |
5905 } | |
5906 | |
5907 assert(p - PyBytes_AS_STRING(repr) > 0); | |
5908 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) | |
5909 return NULL; | |
5910 return repr; | |
5911 } | |
5912 | |
5913 PyObject * | |
5914 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, | |
5915 Py_ssize_t size) | |
5916 { | |
5917 PyObject *result; | |
5918 PyObject *tmp = PyUnicode_FromUnicode(s, size); | |
5919 if (tmp == NULL) | |
5920 return NULL; | |
5921 result = PyUnicode_AsUnicodeEscapeString(tmp); | |
5922 Py_DECREF(tmp); | |
5923 return result; | |
5924 } | |
5925 | |
5926 /* --- Raw Unicode Escape Codec ------------------------------------------- */ | |
5927 | |
5928 PyObject * | |
5929 PyUnicode_DecodeRawUnicodeEscape(const char *s, | |
5930 Py_ssize_t size, | |
5931 const char *errors) | |
5932 { | |
5933 const char *starts = s; | |
5934 Py_ssize_t startinpos; | |
5935 Py_ssize_t endinpos; | |
5936 Py_ssize_t outpos; | |
5937 PyObject *v; | |
5938 const char *end; | |
5939 const char *bs; | |
5940 PyObject *errorHandler = NULL; | |
5941 PyObject *exc = NULL; | |
5942 | |
5943 /* Escaped strings will always be longer than the resulting | |
5944 Unicode string, so we start with size here and then reduce the | |
5945 length after conversion to the true value. (But decoding error | |
5946 handler might have to resize the string) */ | |
5947 v = PyUnicode_New(size, 127); | |
5948 if (v == NULL) | |
5949 goto onError; | |
5950 if (size == 0) | |
5951 return v; | |
5952 outpos = 0; | |
5953 end = s + size; | |
5954 while (s < end) { | |
5955 unsigned char c; | |
5956 Py_UCS4 x; | |
5957 int i; | |
5958 int count; | |
5959 | |
5960 /* Non-escape characters are interpreted as Unicode ordinals */ | |
5961 if (*s != '\\') { | |
5962 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) | |
5963 goto onError; | |
5964 continue; | |
5965 } | |
5966 startinpos = s-starts; | |
5967 | |
5968 /* \u-escapes are only interpreted iff the number of leading | |
5969 backslashes if odd */ | |
5970 bs = s; | |
5971 for (;s < end;) { | |
5972 if (*s != '\\') | |
5973 break; | |
5974 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) | |
5975 goto onError; | |
5976 } | |
5977 if (((s - bs) & 1) == 0 || | |
5978 s >= end || | |
5979 (*s != 'u' && *s != 'U')) { | |
5980 continue; | |
5981 } | |
5982 outpos--; | |
5983 count = *s=='u' ? 4 : 8; | |
5984 s++; | |
5985 | |
5986 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ | |
5987 for (x = 0, i = 0; i < count; ++i, ++s) { | |
5988 c = (unsigned char)*s; | |
5989 if (!Py_ISXDIGIT(c)) { | |
5990 endinpos = s-starts; | |
5991 if (unicode_decode_call_errorhandler( | |
5992 errors, &errorHandler, | |
5993 "rawunicodeescape", "truncated \\uXXXX", | |
5994 &starts, &end, &startinpos, &endinpos, &exc, &s, | |
5995 &v, &outpos)) | |
5996 goto onError; | |
5997 goto nextByte; | |
5998 } | |
5999 x = (x<<4) & ~0xF; | |
6000 if (c >= '0' && c <= '9') | |
6001 x += c - '0'; | |
6002 else if (c >= 'a' && c <= 'f') | |
6003 x += 10 + c - 'a'; | |
6004 else | |
6005 x += 10 + c - 'A'; | |
6006 } | |
6007 if (x <= MAX_UNICODE) { | |
6008 if (unicode_putchar(&v, &outpos, x) < 0) | |
6009 goto onError; | |
6010 } else { | |
6011 endinpos = s-starts; | |
6012 if (unicode_decode_call_errorhandler( | |
6013 errors, &errorHandler, | |
6014 "rawunicodeescape", "\\Uxxxxxxxx out of range", | |
6015 &starts, &end, &startinpos, &endinpos, &exc, &s, | |
6016 &v, &outpos)) | |
6017 goto onError; | |
6018 } | |
6019 nextByte: | |
6020 ; | |
6021 } | |
6022 if (PyUnicode_Resize(&v, outpos) < 0) | |
6023 goto onError; | |
6024 Py_XDECREF(errorHandler); | |
6025 Py_XDECREF(exc); | |
6026 return unicode_result(v); | |
6027 | |
6028 onError: | |
6029 Py_XDECREF(v); | |
6030 Py_XDECREF(errorHandler); | |
6031 Py_XDECREF(exc); | |
6032 return NULL; | |
6033 } | |
6034 | |
6035 | |
6036 PyObject * | |
6037 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) | |
6038 { | |
6039 PyObject *repr; | |
6040 char *p; | |
6041 char *q; | |
6042 Py_ssize_t expandsize, pos; | |
6043 int kind; | |
6044 void *data; | |
6045 Py_ssize_t len; | |
6046 | |
6047 if (!PyUnicode_Check(unicode)) { | |
6048 PyErr_BadArgument(); | |
6049 return NULL; | |
6050 } | |
6051 if (PyUnicode_READY(unicode) < 0) | |
6052 return NULL; | |
6053 kind = PyUnicode_KIND(unicode); | |
6054 data = PyUnicode_DATA(unicode); | |
6055 len = PyUnicode_GET_LENGTH(unicode); | |
6056 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 | |
6057 bytes, and 1 byte characters 4. */ | |
6058 expandsize = kind * 2 + 2; | |
6059 | |
6060 if (len > PY_SSIZE_T_MAX / expandsize) | |
6061 return PyErr_NoMemory(); | |
6062 | |
6063 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); | |
6064 if (repr == NULL) | |
6065 return NULL; | |
6066 if (len == 0) | |
6067 return repr; | |
6068 | |
6069 p = q = PyBytes_AS_STRING(repr); | |
6070 for (pos = 0; pos < len; pos++) { | |
6071 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); | |
6072 /* Map 32-bit characters to '\Uxxxxxxxx' */ | |
6073 if (ch >= 0x10000) { | |
6074 assert(ch <= MAX_UNICODE); | |
6075 *p++ = '\\'; | |
6076 *p++ = 'U'; | |
6077 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; | |
6078 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; | |
6079 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; | |
6080 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; | |
6081 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; | |
6082 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; | |
6083 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; | |
6084 *p++ = Py_hexdigits[ch & 15]; | |
6085 } | |
6086 /* Map 16-bit characters to '\uxxxx' */ | |
6087 else if (ch >= 256) { | |
6088 *p++ = '\\'; | |
6089 *p++ = 'u'; | |
6090 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; | |
6091 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; | |
6092 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; | |
6093 *p++ = Py_hexdigits[ch & 15]; | |
6094 } | |
6095 /* Copy everything else as-is */ | |
6096 else | |
6097 *p++ = (char) ch; | |
6098 } | |
6099 | |
6100 assert(p > q); | |
6101 if (_PyBytes_Resize(&repr, p - q) < 0) | |
6102 return NULL; | |
6103 return repr; | |
6104 } | |
6105 | |
6106 PyObject * | |
6107 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, | |
6108 Py_ssize_t size) | |
6109 { | |
6110 PyObject *result; | |
6111 PyObject *tmp = PyUnicode_FromUnicode(s, size); | |
6112 if (tmp == NULL) | |
6113 return NULL; | |
6114 result = PyUnicode_AsRawUnicodeEscapeString(tmp); | |
6115 Py_DECREF(tmp); | |
6116 return result; | |
6117 } | |
6118 | |
6119 /* --- Unicode Internal Codec ------------------------------------------- */ | |
6120 | |
6121 PyObject * | |
6122 _PyUnicode_DecodeUnicodeInternal(const char *s, | |
6123 Py_ssize_t size, | |
6124 const char *errors) | |
6125 { | |
6126 const char *starts = s; | |
6127 Py_ssize_t startinpos; | |
6128 Py_ssize_t endinpos; | |
6129 Py_ssize_t outpos; | |
6130 PyObject *v; | |
6131 const char *end; | |
6132 const char *reason; | |
6133 PyObject *errorHandler = NULL; | |
6134 PyObject *exc = NULL; | |
6135 | |
6136 if (PyErr_WarnEx(PyExc_DeprecationWarning, | |
6137 "unicode_internal codec has been deprecated", | |
6138 1)) | |
6139 return NULL; | |
6140 | |
6141 /* XXX overflow detection missing */ | |
6142 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127); | |
6143 if (v == NULL) | |
6144 goto onError; | |
6145 if (PyUnicode_GET_LENGTH(v) == 0) | |
6146 return v; | |
6147 outpos = 0; | |
6148 end = s + size; | |
6149 | |
6150 while (s < end) { | |
6151 Py_UNICODE uch; | |
6152 Py_UCS4 ch; | |
6153 /* We copy the raw representation one byte at a time because the | |
6154 pointer may be unaligned (see test_codeccallbacks). */ | |
6155 ((char *) &uch)[0] = s[0]; | |
6156 ((char *) &uch)[1] = s[1]; | |
6157 #ifdef Py_UNICODE_WIDE | |
6158 ((char *) &uch)[2] = s[2]; | |
6159 ((char *) &uch)[3] = s[3]; | |
6160 #endif | |
6161 ch = uch; | |
6162 | |
6163 /* We have to sanity check the raw data, otherwise doom looms for | |
6164 some malformed UCS-4 data. */ | |
6165 if ( | |
6166 #ifdef Py_UNICODE_WIDE | |
6167 ch > 0x10ffff || | |
6168 #endif | |
6169 end-s < Py_UNICODE_SIZE | |
6170 ) | |
6171 { | |
6172 startinpos = s - starts; | |
6173 if (end-s < Py_UNICODE_SIZE) { | |
6174 endinpos = end-starts; | |
6175 reason = "truncated input"; | |
6176 } | |
6177 else { | |
6178 endinpos = s - starts + Py_UNICODE_SIZE; | |
6179 reason = "illegal code point (> 0x10FFFF)"; | |
6180 } | |
6181 if (unicode_decode_call_errorhandler( | |
6182 errors, &errorHandler, | |
6183 "unicode_internal", reason, | |
6184 &starts, &end, &startinpos, &endinpos, &exc, &s, | |
6185 &v, &outpos)) | |
6186 goto onError; | |
6187 continue; | |
6188 } | |
6189 | |
6190 s += Py_UNICODE_SIZE; | |
6191 #ifndef Py_UNICODE_WIDE | |
6192 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end) | |
6193 { | |
6194 Py_UNICODE uch2; | |
6195 ((char *) &uch2)[0] = s[0]; | |
6196 ((char *) &uch2)[1] = s[1]; | |
6197 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) | |
6198 { | |
6199 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); | |
6200 s += Py_UNICODE_SIZE; | |
6201 } | |
6202 } | |
6203 #endif | |
6204 | |
6205 if (unicode_putchar(&v, &outpos, ch) < 0) | |
6206 goto onError; | |
6207 } | |
6208 | |
6209 if (PyUnicode_Resize(&v, outpos) < 0) | |
6210 goto onError; | |
6211 Py_XDECREF(errorHandler); | |
6212 Py_XDECREF(exc); | |
6213 return unicode_result(v); | |
6214 | |
6215 onError: | |
6216 Py_XDECREF(v); | |
6217 Py_XDECREF(errorHandler); | |
6218 Py_XDECREF(exc); | |
6219 return NULL; | |
6220 } | |
6221 | |
6222 /* --- Latin-1 Codec ------------------------------------------------------ */ | |
6223 | |
6224 PyObject * | |
6225 PyUnicode_DecodeLatin1(const char *s, | |
6226 Py_ssize_t size, | |
6227 const char *errors) | |
6228 { | |
6229 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ | |
6230 return _PyUnicode_FromUCS1((unsigned char*)s, size); | |
6231 } | |
6232 | |
6233 /* create or adjust a UnicodeEncodeError */ | |
6234 static void | |
6235 make_encode_exception(PyObject **exceptionObject, | |
6236 const char *encoding, | |
6237 PyObject *unicode, | |
6238 Py_ssize_t startpos, Py_ssize_t endpos, | |
6239 const char *reason) | |
6240 { | |
6241 if (*exceptionObject == NULL) { | |
6242 *exceptionObject = PyObject_CallFunction( | |
6243 PyExc_UnicodeEncodeError, "sOnns", | |
6244 encoding, unicode, startpos, endpos, reason); | |
6245 } | |
6246 else { | |
6247 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) | |
6248 goto onError; | |
6249 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) | |
6250 goto onError; | |
6251 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) | |
6252 goto onError; | |
6253 return; | |
6254 onError: | |
6255 Py_DECREF(*exceptionObject); | |
6256 *exceptionObject = NULL; | |
6257 } | |
6258 } | |
6259 | |
6260 /* raises a UnicodeEncodeError */ | |
6261 static void | |
6262 raise_encode_exception(PyObject **exceptionObject, | |
6263 const char *encoding, | |
6264 PyObject *unicode, | |
6265 Py_ssize_t startpos, Py_ssize_t endpos, | |
6266 const char *reason) | |
6267 { | |
6268 make_encode_exception(exceptionObject, | |
6269 encoding, unicode, startpos, endpos, reason); | |
6270 if (*exceptionObject != NULL) | |
6271 PyCodec_StrictErrors(*exceptionObject); | |
6272 } | |
6273 | |
6274 /* error handling callback helper: | |
6275 build arguments, call the callback and check the arguments, | |
6276 put the result into newpos and return the replacement string, which | |
6277 has to be freed by the caller */ | |
6278 static PyObject * | |
6279 unicode_encode_call_errorhandler(const char *errors, | |
6280 PyObject **errorHandler, | |
6281 const char *encoding, const char *reason, | |
6282 PyObject *unicode, PyObject **exceptionObject, | |
6283 Py_ssize_t startpos, Py_ssize_t endpos, | |
6284 Py_ssize_t *newpos) | |
6285 { | |
6286 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; | |
6287 Py_ssize_t len; | |
6288 PyObject *restuple; | |
6289 PyObject *resunicode; | |
6290 | |
6291 if (*errorHandler == NULL) { | |
6292 *errorHandler = PyCodec_LookupError(errors); | |
6293 if (*errorHandler == NULL) | |
6294 return NULL; | |
6295 } | |
6296 | |
6297 if (PyUnicode_READY(unicode) < 0) | |
6298 return NULL; | |
6299 len = PyUnicode_GET_LENGTH(unicode); | |
6300 | |
6301 make_encode_exception(exceptionObject, | |
6302 encoding, unicode, startpos, endpos, reason); | |
6303 if (*exceptionObject == NULL) | |
6304 return NULL; | |
6305 | |
6306 restuple = PyObject_CallFunctionObjArgs( | |
6307 *errorHandler, *exceptionObject, NULL); | |
6308 if (restuple == NULL) | |
6309 return NULL; | |
6310 if (!PyTuple_Check(restuple)) { | |
6311 PyErr_SetString(PyExc_TypeError, &argparse[3]); | |
6312 Py_DECREF(restuple); | |
6313 return NULL; | |
6314 } | |
6315 if (!PyArg_ParseTuple(restuple, argparse, | |
6316 &resunicode, newpos)) { | |
6317 Py_DECREF(restuple); | |
6318 return NULL; | |
6319 } | |
6320 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { | |
6321 PyErr_SetString(PyExc_TypeError, &argparse[3]); | |
6322 Py_DECREF(restuple); | |
6323 return NULL; | |
6324 } | |
6325 if (*newpos<0) | |
6326 *newpos = len + *newpos; | |
6327 if (*newpos<0 || *newpos>len) { | |
6328 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); | |
6329 Py_DECREF(restuple); | |
6330 return NULL; | |
6331 } | |
6332 Py_INCREF(resunicode); | |
6333 Py_DECREF(restuple); | |
6334 return resunicode; | |
6335 } | |
6336 | |
/* Encode `unicode` into a bytes object, one output byte per character whose
   code point is below `limit`.  The codec name and error reason are chosen
   from `limit`: 256 selects "latin-1", any other value selects "ascii".
   Runs of characters >= limit are handled according to `errors`:
     NULL / "strict"      raise via raise_encode_exception() and fail
     "replace"            one '?' per unencodable character
     "ignore"             the run is skipped
     "xmlcharrefreplace"  each character becomes "&#<decimal>;"
     anything else        the handler is called through
                          unicode_encode_call_errorhandler()
   Returns the bytes object, or NULL on failure. */
6337 static PyObject * | |
6338 unicode_encode_ucs1(PyObject *unicode, | |
6339 const char *errors, | |
6340 unsigned int limit) | |
6341 { | |
6342 /* input state */ | |
6343 Py_ssize_t pos=0, size; | |
6344 int kind; | |
6345 void *data; | |
6346 /* output object */ | |
6347 PyObject *res; | |
6348 /* pointer into the output */ | |
6349 char *str; | |
6350 /* current output position */ | |
6351 Py_ssize_t ressize; | |
6352 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; | |
6353 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; | |
6354 PyObject *errorHandler = NULL; | |
6355 PyObject *exc = NULL; | |
6356 /* the following variable is used for caching string comparisons | |
6357 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ | |
6358 int known_errorHandler = -1; | |
6359 | |
6360 if (PyUnicode_READY(unicode) < 0) | |
6361 return NULL; | |
6362 size = PyUnicode_GET_LENGTH(unicode); | |
6363 kind = PyUnicode_KIND(unicode); | |
6364 data = PyUnicode_DATA(unicode); | |
6365 /* allocate enough for a simple encoding without | |
6366 replacements, if we need more, we'll resize */ | |
6367 if (size == 0) | |
6368 return PyBytes_FromStringAndSize(NULL, 0); | |
6369 res = PyBytes_FromStringAndSize(NULL, size); | |
6370 if (res == NULL) | |
6371 return NULL; | |
6372 str = PyBytes_AS_STRING(res); | |
6373 ressize = size; | |
6374 | |
6375 while (pos < size) { | |
6376 Py_UCS4 c = PyUnicode_READ(kind, data, pos); | |
6377 | |
6378 /* can we encode this? */ | |
6379 if (c<limit) { | |
6380 /* no overflow check, because we know that the space is enough */ | |
6381 *str++ = (char)c; | |
6382 ++pos; | |
6383 } | |
6384 else { | |
6385 Py_ssize_t requiredsize; | |
6386 PyObject *repunicode; | |
6387 Py_ssize_t repsize, newpos, respos, i; | |
6388 /* startpos for collecting unencodable chars */ | |
6389 Py_ssize_t collstart = pos; | |
6390 Py_ssize_t collend = pos; | |
6391 /* find all unencodable characters */ | |
6392 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit)) | |
6393 ++collend; | |
6394 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ | |
6395 if (known_errorHandler==-1) { | |
6396 if ((errors==NULL) || (!strcmp(errors, "strict"))) | |
6397 known_errorHandler = 1; | |
6398 else if (!strcmp(errors, "replace")) | |
6399 known_errorHandler = 2; | |
6400 else if (!strcmp(errors, "ignore")) | |
6401 known_errorHandler = 3; | |
6402 else if (!strcmp(errors, "xmlcharrefreplace")) | |
6403 known_errorHandler = 4; | |
6404 else | |
6405 known_errorHandler = 0; | |
6406 } | |
6407 switch (known_errorHandler) { | |
6408 case 1: /* strict */ | |
6409 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); | |
6410 goto onError; | |
6411 case 2: /* replace */ | |
/* One '?' per unencodable character fits in the space already reserved for
   the run; control then falls into the "ignore" case to advance pos. */
6412 while (collstart++<collend) | |
6413 *str++ = '?'; /* fall through */ | |
6414 case 3: /* ignore */ | |
6415 pos = collend; | |
6416 break; | |
6417 case 4: /* xmlcharrefreplace */ | |
6418 respos = str - PyBytes_AS_STRING(res); | |
6419 /* determine replacement size */ | |
/* Each reference is "&#" + decimal digits + ";", i.e. 2 + ndigits + 1. */
6420 for (i = collstart, repsize = 0; i < collend; ++i) { | |
6421 Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
6422 if (ch < 10) | |
6423 repsize += 2+1+1; | |
6424 else if (ch < 100) | |
6425 repsize += 2+2+1; | |
6426 else if (ch < 1000) | |
6427 repsize += 2+3+1; | |
6428 else if (ch < 10000) | |
6429 repsize += 2+4+1; | |
6430 else if (ch < 100000) | |
6431 repsize += 2+5+1; | |
6432 else if (ch < 1000000) | |
6433 repsize += 2+6+1; | |
6434 else { | |
6435 assert(ch <= MAX_UNICODE); | |
6436 repsize += 2+7+1; | |
6437 } | |
6438 } | |
/* Bytes written so far + replacement + the not yet encoded tail. */
6439 requiredsize = respos+repsize+(size-collend); | |
6440 if (requiredsize > ressize) { | |
6441 if (requiredsize<2*ressize) | |
6442 requiredsize = 2*ressize; | |
6443 if (_PyBytes_Resize(&res, requiredsize)) | |
6444 goto onError; | |
/* The resize is given &res, so str is recomputed from the saved offset. */
6445 str = PyBytes_AS_STRING(res) + respos; | |
6446 ressize = requiredsize; | |
6447 } | |
6448 /* generate replacement */ | |
/* NOTE(review): sprintf also stores a terminating NUL after each reference,
   and the value is printed with %d -- confirm both are acceptable here. */
6449 for (i = collstart; i < collend; ++i) { | |
6450 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); | |
6451 } | |
6452 pos = collend; | |
6453 break; | |
6454 default: | |
6455 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, | |
6456 encoding, reason, unicode, &exc, | |
6457 collstart, collend, &newpos); | |
6458 if (repunicode == NULL || (PyUnicode_Check(repunicode) && | |
6459 PyUnicode_READY(repunicode) < 0)) | |
6460 goto onError; | |
6461 if (PyBytes_Check(repunicode)) { | |
6462 /* Directly copy bytes result to output. */ | |
6463 repsize = PyBytes_Size(repunicode); | |
6464 if (repsize > 1) { | |
6465 /* Make room for all additional bytes. */ | |
/* NOTE(review): growing by repsize-1 presumably relies on one byte already
   being reserved for the failing character; confirm this still holds when
   the handler returns a newpos smaller than collend. */
6466 respos = str - PyBytes_AS_STRING(res); | |
6467 if (_PyBytes_Resize(&res, ressize+repsize-1)) { | |
6468 Py_DECREF(repunicode); | |
6469 goto onError; | |
6470 } | |
6471 str = PyBytes_AS_STRING(res) + respos; | |
6472 ressize += repsize-1; | |
6473 } | |
6474 memcpy(str, PyBytes_AsString(repunicode), repsize); | |
6475 str += repsize; | |
6476 pos = newpos; | |
6477 Py_DECREF(repunicode); | |
6478 break; | |
6479 } | |
6480 /* need more space? (at least enough for what we | |
6481 have+the replacement+the rest of the string, so | |
6482 we won't have to check space for encodable characters) */ | |
6483 respos = str - PyBytes_AS_STRING(res); | |
6484 repsize = PyUnicode_GET_LENGTH(repunicode); | |
6485 requiredsize = respos+repsize+(size-collend); | |
6486 if (requiredsize > ressize) { | |
6487 if (requiredsize<2*ressize) | |
6488 requiredsize = 2*ressize; | |
6489 if (_PyBytes_Resize(&res, requiredsize)) { | |
6490 Py_DECREF(repunicode); | |
6491 goto onError; | |
6492 } | |
6493 str = PyBytes_AS_STRING(res) + respos; | |
6494 ressize = requiredsize; | |
6495 } | |
6496 /* check if there is anything unencodable in the replacement | |
6497 and copy it to the output */ | |
6498 for (i = 0; repsize-->0; ++i, ++str) { | |
6499 c = PyUnicode_READ_CHAR(repunicode, i); | |
6500 if (c >= limit) { | |
6501 raise_encode_exception(&exc, encoding, unicode, | |
6502 pos, pos+1, reason); | |
6503 Py_DECREF(repunicode); | |
6504 goto onError; | |
6505 } | |
6506 *str = (char)c; | |
6507 } | |
6508 pos = newpos; | |
6509 Py_DECREF(repunicode); | |
6510 } | |
6511 } | |
6512 } | |
6513 /* Resize if we allocated too much */ | |
6514 size = str - PyBytes_AS_STRING(res); | |
6515 if (size < ressize) { /* If this fails res will be NULL */ | |
6516 assert(size >= 0); | |
6517 if (_PyBytes_Resize(&res, size) < 0) | |
6518 goto onError; | |
6519 } | |
6520 | |
6521 Py_XDECREF(errorHandler); | |
6522 Py_XDECREF(exc); | |
6523 return res; | |
6524 | |
6525 onError: | |
6526 Py_XDECREF(res); | |
6527 Py_XDECREF(errorHandler); | |
6528 Py_XDECREF(exc); | |
6529 return NULL; | |
6530 } | |
6531 | |
6532 /* Deprecated */ | |
6533 PyObject * | |
6534 PyUnicode_EncodeLatin1(const Py_UNICODE *p, | |
6535 Py_ssize_t size, | |
6536 const char *errors) | |
6537 { | |
6538 PyObject *result; | |
6539 PyObject *unicode = PyUnicode_FromUnicode(p, size); | |
6540 if (unicode == NULL) | |
6541 return NULL; | |
6542 result = unicode_encode_ucs1(unicode, errors, 256); | |
6543 Py_DECREF(unicode); | |
6544 return result; | |
6545 } | |
6546 | |
6547 PyObject * | |
6548 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) | |
6549 { | |
6550 if (!PyUnicode_Check(unicode)) { | |
6551 PyErr_BadArgument(); | |
6552 return NULL; | |
6553 } | |
6554 if (PyUnicode_READY(unicode) == -1) | |
6555 return NULL; | |
6556 /* Fast path: if it is a one-byte string, construct | |
6557 bytes object directly. */ | |
6558 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) | |
6559 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), | |
6560 PyUnicode_GET_LENGTH(unicode)); | |
6561 /* Non-Latin-1 characters present. Defer to above function to | |
6562 raise the exception. */ | |
6563 return unicode_encode_ucs1(unicode, errors, 256); | |
6564 } | |
6565 | |
6566 PyObject* | |
6567 PyUnicode_AsLatin1String(PyObject *unicode) | |
6568 { | |
6569 return _PyUnicode_AsLatin1String(unicode, NULL); | |
6570 } | |
6571 | |
6572 /* --- 7-bit ASCII Codec -------------------------------------------------- */ | |
6573 | |
/* Decode `size` bytes at `s` as 7-bit ASCII.
   Empty input returns the shared unicode_empty object and a single ASCII
   byte is served by get_latin1_char().  Otherwise the input is first
   scanned for any byte with the high bit set; if none is found the result
   is built by unicode_fromascii().  Only when such a byte exists is the
   input decoded byte by byte, with every byte >= 128 passed to the error
   handler selected by `errors`.
   Returns the decoded object, or NULL on failure. */
6574 PyObject * | |
6575 PyUnicode_DecodeASCII(const char *s, | |
6576 Py_ssize_t size, | |
6577 const char *errors) | |
6578 { | |
6579 const char *starts = s; | |
6580 PyObject *v; | |
6581 int kind; | |
6582 void *data; | |
6583 Py_ssize_t startinpos; | |
6584 Py_ssize_t endinpos; | |
6585 Py_ssize_t outpos; | |
6586 const char *e; | |
6587 int has_error; | |
6588 const unsigned char *p = (const unsigned char *)s; | |
6589 const unsigned char *end = p + size; | |
/* `end` with its low address bits (LONG_PTR_MASK) cleared; the word-wise
   scan below stops here. */
6590 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); | |
6591 PyObject *errorHandler = NULL; | |
6592 PyObject *exc = NULL; | |
6593 | |
6594 if (size == 0) { | |
6595 Py_INCREF(unicode_empty); | |
6596 return unicode_empty; | |
6597 } | |
6598 | |
6599 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ | |
6600 if (size == 1 && (unsigned char)s[0] < 128) | |
6601 return get_latin1_char((unsigned char)s[0]); | |
6602 | |
/* Detection pass: only find out whether a non-ASCII byte exists. */
6603 has_error = 0; | |
6604 while (p < end && !has_error) { | |
6605 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for | |
6606 an explanation. */ | |
6607 if (!((size_t) p & LONG_PTR_MASK)) { | |
6608 /* Help register allocation */ | |
6609 register const unsigned char *_p = p; | |
/* Test one unsigned long worth of bytes at a time against ASCII_CHAR_MASK. */
6610 while (_p < aligned_end) { | |
6611 unsigned long value = *(unsigned long *) _p; | |
6612 if (value & ASCII_CHAR_MASK) { | |
6613 has_error = 1; | |
6614 break; | |
6615 } | |
6616 _p += SIZEOF_LONG; | |
6617 } | |
6618 if (_p == end) | |
6619 break; | |
6620 if (has_error) | |
6621 break; | |
6622 p = _p; | |
6623 } | |
6624 if (*p & 0x80) { | |
6625 has_error = 1; | |
6626 break; | |
6627 } | |
6628 else { | |
6629 ++p; | |
6630 } | |
6631 } | |
6632 if (!has_error) | |
6633 return unicode_fromascii((const unsigned char *)s, size); | |
6634 | |
/* Slow path: at least one byte needs the error handler. */
6635 v = PyUnicode_New(size, 127); | |
6636 if (v == NULL) | |
6637 goto onError; | |
/* size == 0 already returned at the top of the function, so this check
   cannot be true here. */
6638 if (size == 0) | |
6639 return v; | |
6640 kind = PyUnicode_KIND(v); | |
6641 data = PyUnicode_DATA(v); | |
6642 outpos = 0; | |
6643 e = s + size; | |
6644 while (s < e) { | |
6645 register unsigned char c = (unsigned char)*s; | |
6646 if (c < 128) { | |
6647 PyUnicode_WRITE(kind, data, outpos++, c); | |
6648 ++s; | |
6649 } | |
6650 else { | |
6651 startinpos = s-starts; | |
6652 endinpos = startinpos + 1; | |
6653 if (unicode_decode_call_errorhandler( | |
6654 errors, &errorHandler, | |
6655 "ascii", "ordinal not in range(128)", | |
6656 &starts, &e, &startinpos, &endinpos, &exc, &s, | |
6657 &v, &outpos)) | |
6658 goto onError; | |
/* The handler is given &v, so kind and data are re-read afterwards
   (presumably because it may resize or replace the object -- confirm). */
6659 kind = PyUnicode_KIND(v); | |
6660 data = PyUnicode_DATA(v); | |
6661 } | |
6662 } | |
6663 if (PyUnicode_Resize(&v, outpos) < 0) | |
6664 goto onError; | |
6665 Py_XDECREF(errorHandler); | |
6666 Py_XDECREF(exc); | |
6667 assert(_PyUnicode_CheckConsistency(v, 1)); | |
6668 return v; | |
6669 | |
6670 onError: | |
6671 Py_XDECREF(v); | |
6672 Py_XDECREF(errorHandler); | |
6673 Py_XDECREF(exc); | |
6674 return NULL; | |
6675 } | |
6676 | |
6677 /* Deprecated */ | |
6678 PyObject * | |
6679 PyUnicode_EncodeASCII(const Py_UNICODE *p, | |
6680 Py_ssize_t size, | |
6681 const char *errors) | |
6682 { | |
6683 PyObject *result; | |
6684 PyObject *unicode = PyUnicode_FromUnicode(p, size); | |
6685 if (unicode == NULL) | |
6686 return NULL; | |
6687 result = unicode_encode_ucs1(unicode, errors, 128); | |
6688 Py_DECREF(unicode); | |
6689 return result; | |
6690 } | |
6691 | |
6692 PyObject * | |
6693 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) | |
6694 { | |
6695 if (!PyUnicode_Check(unicode)) { | |
6696 PyErr_BadArgument(); | |
6697 return NULL; | |
6698 } | |
6699 if (PyUnicode_READY(unicode) == -1) | |
6700 return NULL; | |
6701 /* Fast path: if it is an ASCII-only string, construct bytes object | |
6702 directly. Else defer to above function to raise the exception. */ | |
6703 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) | |
6704 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), | |
6705 PyUnicode_GET_LENGTH(unicode)); | |
6706 return unicode_encode_ucs1(unicode, errors, 128); | |
6707 } | |
6708 | |
6709 PyObject * | |
6710 PyUnicode_AsASCIIString(PyObject *unicode) | |
6711 { | |
6712 return _PyUnicode_AsASCIIString(unicode, NULL); | |
6713 } | |
6714 | |
6715 #ifdef HAVE_MBCS | |
6716 | |
6717 /* --- MBCS codecs for Windows -------------------------------------------- */ | |
6718 | |
6719 #if SIZEOF_INT < SIZEOF_SIZE_T | |
6720 #define NEED_RETRY | |
6721 #endif | |
6722 | |
6723 #ifndef WC_ERR_INVALID_CHARS | |
6724 # define WC_ERR_INVALID_CHARS 0x0080 | |
6725 #endif | |
6726 | |
6727 static char* | |
6728 code_page_name(UINT code_page, PyObject **obj) | |
6729 { | |
6730 *obj = NULL; | |
6731 if (code_page == CP_ACP) | |
6732 return "mbcs"; | |
6733 if (code_page == CP_UTF7) | |
6734 return "CP_UTF7"; | |
6735 if (code_page == CP_UTF8) | |
6736 return "CP_UTF8"; | |
6737 | |
6738 *obj = PyBytes_FromFormat("cp%u", code_page); | |
6739 if (*obj == NULL) | |
6740 return NULL; | |
6741 return PyBytes_AS_STRING(*obj); | |
6742 } | |
6743 | |
6744 static int | |
6745 is_dbcs_lead_byte(UINT code_page, const char *s, int offset) | |
6746 { | |
6747 const char *curr = s + offset; | |
6748 const char *prev; | |
6749 | |
6750 if (!IsDBCSLeadByteEx(code_page, *curr)) | |
6751 return 0; | |
6752 | |
6753 prev = CharPrevExA(code_page, s, curr, 0); | |
6754 if (prev == curr) | |
6755 return 1; | |
6756 /* FIXME: This code is limited to "true" double-byte encodings, | |
6757 as it assumes an incomplete character consists of a single | |
6758 byte. */ | |
6759 if (curr - prev == 2) | |
6760 return 1; | |
6761 if (!IsDBCSLeadByteEx(code_page, *prev)) | |
6762 return 1; | |
6763 return 0; | |
6764 } | |
6765 | |
6766 static DWORD | |
6767 decode_code_page_flags(UINT code_page) | |
6768 { | |
6769 if (code_page == CP_UTF7) { | |
6770 /* The CP_UTF7 decoder only supports flags=0 */ | |
6771 return 0; | |
6772 } | |
6773 else | |
6774 return MB_ERR_INVALID_CHARS; | |
6775 } | |
6776 | |
6777 /* | |
6778 * Decode a byte string from a Windows code page into unicode object in strict | |
6779 * mode. | |
6780 * | |
6781 * Returns consumed size if succeed, returns -2 on decode error, or raise a | |
6782 * WindowsError and returns -1 on other error. | |
6783 */ | |
6784 static int | |
6785 decode_code_page_strict(UINT code_page, | |
6786 PyObject **v, | |
6787 const char *in, | |
6788 int insize) | |
6789 { | |
6790 const DWORD flags = decode_code_page_flags(code_page); | |
6791 wchar_t *out; | |
6792 DWORD outsize; | |
6793 | |
6794 /* First get the size of the result */ | |
6795 assert(insize > 0); | |
6796 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); | |
6797 if (outsize <= 0) | |
6798 goto error; | |
6799 | |
6800 if (*v == NULL) { | |
6801 /* Create unicode object */ | |
6802 *v = (PyObject*)_PyUnicode_New(outsize); | |
6803 if (*v == NULL) | |
6804 return -1; | |
6805 out = PyUnicode_AS_UNICODE(*v); | |
6806 } | |
6807 else { | |
6808 /* Extend unicode object */ | |
6809 Py_ssize_t n = PyUnicode_GET_SIZE(*v); | |
6810 if (PyUnicode_Resize(v, n + outsize) < 0) | |
6811 return -1; | |
6812 out = PyUnicode_AS_UNICODE(*v) + n; | |
6813 } | |
6814 | |
6815 /* Do the conversion */ | |
6816 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); | |
6817 if (outsize <= 0) | |
6818 goto error; | |
6819 return insize; | |
6820 | |
6821 error: | |
6822 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) | |
6823 return -2; | |
6824 PyErr_SetFromWindowsErr(0); | |
6825 return -1; | |
6826 } | |
6827 | |
6828 /* | |
6829 * Decode a byte string from a code page into unicode object with an error | |
6830 * handler. | |
6831 * | |
6832 * Returns consumed size if succeed, or raise a WindowsError or | |
6833 * UnicodeDecodeError exception and returns -1 on error. | |
6834 */ | |
/* If *v is NULL a new object is created, otherwise the decoded characters
   are appended to the existing one.  With errors NULL or "strict" the
   function only raises the UnicodeDecodeError and fails; it does not
   decode anything itself. */
6835 static int | |
6836 decode_code_page_errors(UINT code_page, | |
6837 PyObject **v, | |
6838 const char *in, const int size, | |
6839 const char *errors) | |
6840 { | |
6841 const char *startin = in; | |
6842 const char *endin = in + size; | |
6843 const DWORD flags = decode_code_page_flags(code_page); | |
6844 /* Ideally, we should get reason from FormatMessage. This is the Windows | |
6845 2000 English version of the message. */ | |
6846 const char *reason = "No mapping for the Unicode character exists " | |
6847 "in the target code page."; | |
6848 /* each step cannot decode more than 1 character, but a character can be | |
6849 represented as a surrogate pair */ | |
6850 wchar_t buffer[2], *startout, *out; | |
6851 int insize, outsize; | |
6852 PyObject *errorHandler = NULL; | |
6853 PyObject *exc = NULL; | |
6854 PyObject *encoding_obj = NULL; | |
6855 char *encoding; | |
6856 DWORD err; | |
/* Stays -1 unless the whole input is decoded; the error label is shared by
   the success and failure paths. */
6857 int ret = -1; | |
6858 | |
6859 assert(size > 0); | |
6860 | |
6861 encoding = code_page_name(code_page, &encoding_obj); | |
6862 if (encoding == NULL) | |
6863 return -1; | |
6864 | |
6865 if (errors == NULL || strcmp(errors, "strict") == 0) { | |
6866 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a | |
6867 UnicodeDecodeError. */ | |
6868 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); | |
6869 if (exc != NULL) { | |
6870 PyCodec_StrictErrors(exc); | |
6871 Py_CLEAR(exc); | |
6872 } | |
6873 goto error; | |
6874 } | |
6875 | |
/* Reserve Py_ARRAY_LENGTH(buffer) output units per input byte, checking
   the multiplication against PY_SSIZE_T_MAX first. */
6876 if (*v == NULL) { | |
6877 /* Create unicode object */ | |
6878 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { | |
6879 PyErr_NoMemory(); | |
6880 goto error; | |
6881 } | |
6882 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); | |
6883 if (*v == NULL) | |
6884 goto error; | |
6885 startout = PyUnicode_AS_UNICODE(*v); | |
6886 } | |
6887 else { | |
6888 /* Extend unicode object */ | |
6889 Py_ssize_t n = PyUnicode_GET_SIZE(*v); | |
6890 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { | |
6891 PyErr_NoMemory(); | |
6892 goto error; | |
6893 } | |
6894 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) | |
6895 goto error; | |
6896 startout = PyUnicode_AS_UNICODE(*v) + n; | |
6897 } | |
6898 | |
6899 /* Decode the byte string character per character */ | |
6900 out = startout; | |
6901 while (in < endin) | |
6902 { | |
6903 /* Decode a character */ | |
/* Try 1 input byte, then 2, ... until the conversion succeeds, fails with
   an unexpected error, or the limits in the loop condition are reached. */
6904 insize = 1; | |
6905 do | |
6906 { | |
6907 outsize = MultiByteToWideChar(code_page, flags, | |
6908 in, insize, | |
6909 buffer, Py_ARRAY_LENGTH(buffer)); | |
6910 if (outsize > 0) | |
6911 break; | |
6912 err = GetLastError(); | |
6913 if (err != ERROR_NO_UNICODE_TRANSLATION | |
6914 && err != ERROR_INSUFFICIENT_BUFFER) | |
6915 { | |
6916 PyErr_SetFromWindowsErr(0); | |
6917 goto error; | |
6918 } | |
6919 insize++; | |
6920 } | |
6921 /* 4=maximum length of a UTF-8 sequence */ | |
6922 while (insize <= 4 && (in + insize) <= endin); | |
6923 | |
6924 if (outsize <= 0) { | |
6925 Py_ssize_t startinpos, endinpos, outpos; | |
6926 | |
/* No prefix could be decoded: report a single undecodable byte at `in`. */
6927 startinpos = in - startin; | |
6928 endinpos = startinpos + 1; | |
6929 outpos = out - PyUnicode_AS_UNICODE(*v); | |
6930 if (unicode_decode_call_errorhandler( | |
6931 errors, &errorHandler, | |
6932 encoding, reason, | |
6933 &startin, &endin, &startinpos, &endinpos, &exc, &in, | |
6934 v, &outpos)) | |
6935 { | |
6936 goto error; | |
6937 } | |
/* The handler is given v and &outpos, so the write pointer is recomputed
   from the object's current buffer. */
6938 out = PyUnicode_AS_UNICODE(*v) + outpos; | |
6939 } | |
6940 else { | |
6941 in += insize; | |
6942 memcpy(out, buffer, outsize * sizeof(wchar_t)); | |
6943 out += outsize; | |
6944 } | |
6945 } | |
6946 | |
6947 /* write a NUL character at the end */ | |
6948 *out = 0; | |
6949 | |
6950 /* Trim the unicode object to the number of units written */ | |
6951 outsize = out - startout; | |
6952 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); | |
6953 if (PyUnicode_Resize(v, outsize) < 0) | |
6954 goto error; | |
6955 ret = size; | |
6956 | |
6957 error: | |
6958 Py_XDECREF(encoding_obj); | |
6959 Py_XDECREF(errorHandler); | |
6960 Py_XDECREF(exc); | |
6961 return ret; | |
6962 } | |
6963 | |
6964 static PyObject * | |
6965 decode_code_page_stateful(int code_page, | |
6966 const char *s, Py_ssize_t size, | |
6967 const char *errors, Py_ssize_t *consumed) | |
6968 { | |
6969 PyObject *v = NULL; | |
6970 int chunk_size, final, converted, done; | |
6971 | |
6972 if (code_page < 0) { | |
6973 PyErr_SetString(PyExc_ValueError, "invalid code page number"); | |
6974 return NULL; | |
6975 } | |
6976 | |
6977 if (consumed) | |
6978 *consumed = 0; | |
6979 | |
6980 do | |
6981 { | |
6982 #ifdef NEED_RETRY | |
6983 if (size > INT_MAX) { | |
6984 chunk_size = INT_MAX; | |
6985 final = 0; | |
6986 done = 0; | |
6987 } | |
6988 else | |
6989 #endif | |
6990 { | |
6991 chunk_size = (int)size; | |
6992 final = (consumed == NULL); | |
6993 done = 1; | |
6994 } | |
6995 | |
6996 /* Skip trailing lead-byte unless 'final' is set */ | |
6997 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1)) | |
6998 --chunk_size; | |
6999 | |
7000 if (chunk_size == 0 && done) { | |
7001 if (v != NULL) | |
7002 break; | |
7003 Py_INCREF(unicode_empty); | |
7004 return unicode_empty; | |
7005 } | |
7006 | |
7007 | |
7008 converted = decode_code_page_strict(code_page, &v, | |
7009 s, chunk_size); | |
7010 if (converted == -2) | |
7011 converted = decode_code_page_errors(code_page, &v, | |
7012 s, chunk_size, | |
7013 errors); | |
7014 assert(converted != 0); | |
7015 | |
7016 if (converted < 0) { | |
7017 Py_XDECREF(v); | |
7018 return NULL; | |
7019 } | |
7020 | |
7021 if (consumed) | |
7022 *consumed += converted; | |
7023 | |
7024 s += converted; | |
7025 size -= converted; | |
7026 } while (!done); | |
7027 | |
7028 return unicode_result(v); | |
7029 } | |
7030 | |
7031 PyObject * | |
7032 PyUnicode_DecodeCodePageStateful(int code_page, | |
7033 const char *s, | |
7034 Py_ssize_t size, | |
7035 const char *errors, | |
7036 Py_ssize_t *consumed) | |
7037 { | |
7038 return decode_code_page_stateful(code_page, s, size, errors, consumed); | |
7039 } | |
7040 | |
7041 PyObject * | |
7042 PyUnicode_DecodeMBCSStateful(const char *s, | |
7043 Py_ssize_t size, | |
7044 const char *errors, | |
7045 Py_ssize_t *consumed) | |
7046 { | |
7047 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); | |
7048 } | |
7049 | |
7050 PyObject * | |
7051 PyUnicode_DecodeMBCS(const char *s, | |
7052 Py_ssize_t size, | |
7053 const char *errors) | |
7054 { | |
7055 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); | |
7056 } | |
7057 | |
7058 static DWORD | |
7059 encode_code_page_flags(UINT code_page, const char *errors) | |
7060 { | |
7061 if (code_page == CP_UTF8) { | |
7062 if (winver.dwMajorVersion >= 6) | |
7063 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista | |
7064 and later */ | |
7065 return WC_ERR_INVALID_CHARS; | |
7066 else | |
7067 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ | |
7068 return 0; | |
7069 } | |
7070 else if (code_page == CP_UTF7) { | |
7071 /* CP_UTF7 only supports flags=0 */ | |
7072 return 0; | |
7073 } | |
7074 else { | |
7075 if (errors != NULL && strcmp(errors, "replace") == 0) | |
7076 return 0; | |
7077 else | |
7078 return WC_NO_BEST_FIT_CHARS; | |
7079 } | |
7080 } | |
7081 | |
7082 /* | |
7083 * Encode a Unicode string to a Windows code page into a byte string in strict | |
7084 * mode. | |
7085 * | |
7086 * Returns consumed characters if succeed, returns -2 on encode error, or raise | |
7087 * a WindowsError and returns -1 on other error. | |
7088 */ | |
7089 static int | |
7090 encode_code_page_strict(UINT code_page, PyObject **outbytes, | |
7091 PyObject *unicode, Py_ssize_t offset, int len, | |
7092 const char* errors) | |
7093 { | |
7094 BOOL usedDefaultChar = FALSE; | |
7095 BOOL *pusedDefaultChar = &usedDefaultChar; | |
7096 int outsize; | |
7097 PyObject *exc = NULL; | |
7098 wchar_t *p; | |
7099 Py_ssize_t size; | |
7100 const DWORD flags = encode_code_page_flags(code_page, NULL); | |
7101 char *out; | |
7102 /* Create a substring so that we can get the UTF-16 representation | |
7103 of just the slice under consideration. */ | |
7104 PyObject *substring; | |
7105 | |
7106 assert(len > 0); | |
7107 | |
7108 if (code_page != CP_UTF8 && code_page != CP_UTF7) | |
7109 pusedDefaultChar = &usedDefaultChar; | |
7110 else | |
7111 pusedDefaultChar = NULL; | |
7112 | |
7113 substring = PyUnicode_Substring(unicode, offset, offset+len); | |
7114 if (substring == NULL) | |
7115 return -1; | |
7116 p = PyUnicode_AsUnicodeAndSize(substring, &size); | |
7117 if (p == NULL) { | |
7118 Py_DECREF(substring); | |
7119 return -1; | |
7120 } | |
7121 | |
7122 /* First get the size of the result */ | |
7123 outsize = WideCharToMultiByte(code_page, flags, | |
7124 p, size, | |
7125 NULL, 0, | |
7126 NULL, pusedDefaultChar); | |
7127 if (outsize <= 0) | |
7128 goto error; | |
7129 /* If we used a default char, then we failed! */ | |
7130 if (pusedDefaultChar && *pusedDefaultChar) { | |
7131 Py_DECREF(substring); | |
7132 return -2; | |
7133 } | |
7134 | |
7135 if (*outbytes == NULL) { | |
7136 /* Create string object */ | |
7137 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); | |
7138 if (*outbytes == NULL) { | |
7139 Py_DECREF(substring); | |
7140 return -1; | |
7141 } | |
7142 out = PyBytes_AS_STRING(*outbytes); | |
7143 } | |
7144 else { | |
7145 /* Extend string object */ | |
7146 const Py_ssize_t n = PyBytes_Size(*outbytes); | |
7147 if (outsize > PY_SSIZE_T_MAX - n) { | |
7148 PyErr_NoMemory(); | |
7149 Py_DECREF(substring); | |
7150 return -1; | |
7151 } | |
7152 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { | |
7153 Py_DECREF(substring); | |
7154 return -1; | |
7155 } | |
7156 out = PyBytes_AS_STRING(*outbytes) + n; | |
7157 } | |
7158 | |
7159 /* Do the conversion */ | |
7160 outsize = WideCharToMultiByte(code_page, flags, | |
7161 p, size, | |
7162 out, outsize, | |
7163 NULL, pusedDefaultChar); | |
7164 Py_CLEAR(substring); | |
7165 if (outsize <= 0) | |
7166 goto error; | |
7167 if (pusedDefaultChar && *pusedDefaultChar) | |
7168 return -2; | |
7169 return 0; | |
7170 | |
7171 error: | |
7172 Py_XDECREF(substring); | |
7173 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) | |
7174 return -2; | |
7175 PyErr_SetFromWindowsErr(0); | |
7176 return -1; | |
7177 } | |
7178 | |
7179 /* | |
7180 * Encode a Unicode string to a Windows code page into a byte string using a | |
7181 * error handler. | |
7182 * | |
7183 * Returns consumed characters if succeed, or raise a WindowsError and returns | |
7184 * -1 on other error. | |
7185 */ | |
7186 static int | |
7187 encode_code_page_errors(UINT code_page, PyObject **outbytes, | |
7188 PyObject *unicode, Py_ssize_t unicode_offset, | |
7189 Py_ssize_t insize, const char* errors) | |
7190 { | |
7191 const DWORD flags = encode_code_page_flags(code_page, errors); | |
7192 Py_ssize_t pos = unicode_offset; | |
7193 Py_ssize_t endin = unicode_offset + insize; | |
7194 /* Ideally, we should get reason from FormatMessage. This is the Windows | |
7195 2000 English version of the message. */ | |
7196 const char *reason = "invalid character"; | |
7197 /* 4=maximum length of a UTF-8 sequence */ | |
7198 char buffer[4]; | |
7199 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; | |
7200 Py_ssize_t outsize; | |
7201 char *out; | |
7202 PyObject *errorHandler = NULL; | |
7203 PyObject *exc = NULL; | |
7204 PyObject *encoding_obj = NULL; | |
7205 char *encoding; | |
7206 Py_ssize_t newpos, newoutsize; | |
7207 PyObject *rep; | |
7208 int ret = -1; | |
7209 | |
7210 assert(insize > 0); | |
7211 | |
7212 encoding = code_page_name(code_page, &encoding_obj); | |
7213 if (encoding == NULL) | |
7214 return -1; | |
7215 | |
7216 if (errors == NULL || strcmp(errors, "strict") == 0) { | |
7217 /* The last error was ERROR_NO_UNICODE_TRANSLATION, | |
7218 then we raise a UnicodeEncodeError. */ | |
7219 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); | |
7220 if (exc != NULL) { | |
7221 PyCodec_StrictErrors(exc); | |
7222 Py_DECREF(exc); | |
7223 } | |
7224 Py_XDECREF(encoding_obj); | |
7225 return -1; | |
7226 } | |
7227 | |
7228 if (code_page != CP_UTF8 && code_page != CP_UTF7) | |
7229 pusedDefaultChar = &usedDefaultChar; | |
7230 else | |
7231 pusedDefaultChar = NULL; | |
7232 | |
7233 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { | |
7234 PyErr_NoMemory(); | |
7235 goto error; | |
7236 } | |
7237 outsize = insize * Py_ARRAY_LENGTH(buffer); | |
7238 | |
7239 if (*outbytes == NULL) { | |
7240 /* Create string object */ | |
7241 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); | |
7242 if (*outbytes == NULL) | |
7243 goto error; | |
7244 out = PyBytes_AS_STRING(*outbytes); | |
7245 } | |
7246 else { | |
7247 /* Extend string object */ | |
7248 Py_ssize_t n = PyBytes_Size(*outbytes); | |
7249 if (n > PY_SSIZE_T_MAX - outsize) { | |
7250 PyErr_NoMemory(); | |
7251 goto error; | |
7252 } | |
7253 if (_PyBytes_Resize(outbytes, n + outsize) < 0) | |
7254 goto error; | |
7255 out = PyBytes_AS_STRING(*outbytes) + n; | |
7256 } | |
7257 | |
7258 /* Encode the string character per character */ | |
7259 while (pos < endin) | |
7260 { | |
7261 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); | |
7262 wchar_t chars[2]; | |
7263 int charsize; | |
7264 if (ch < 0x10000) { | |
7265 chars[0] = (wchar_t)ch; | |
7266 charsize = 1; | |
7267 } | |
7268 else { | |
7269 ch -= 0x10000; | |
7270 chars[0] = 0xd800 + (ch >> 10); | |
7271 chars[1] = 0xdc00 + (ch & 0x3ff); | |
7272 charsize = 2; | |
7273 } | |
7274 | |
7275 outsize = WideCharToMultiByte(code_page, flags, | |
7276 chars, charsize, | |
7277 buffer, Py_ARRAY_LENGTH(buffer), | |
7278 NULL, pusedDefaultChar); | |
7279 if (outsize > 0) { | |
7280 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) | |
7281 { | |
7282 pos++; | |
7283 memcpy(out, buffer, outsize); | |
7284 out += outsize; | |
7285 continue; | |
7286 } | |
7287 } | |
7288 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { | |
7289 PyErr_SetFromWindowsErr(0); | |
7290 goto error; | |
7291 } | |
7292 | |
7293 rep = unicode_encode_call_errorhandler( | |
7294 errors, &errorHandler, encoding, reason, | |
7295 unicode, &exc, | |
7296 pos, pos + 1, &newpos); | |
7297 if (rep == NULL) | |
7298 goto error; | |
7299 pos = newpos; | |
7300 | |
7301 if (PyBytes_Check(rep)) { | |
7302 outsize = PyBytes_GET_SIZE(rep); | |
7303 if (outsize != 1) { | |
7304 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); | |
7305 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); | |
7306 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { | |
7307 Py_DECREF(rep); | |
7308 goto error; | |
7309 } | |
7310 out = PyBytes_AS_STRING(*outbytes) + offset; | |
7311 } | |
7312 memcpy(out, PyBytes_AS_STRING(rep), outsize); | |
7313 out += outsize; | |
7314 } | |
7315 else { | |
7316 Py_ssize_t i; | |
7317 enum PyUnicode_Kind kind; | |
7318 void *data; | |
7319 | |
7320 if (PyUnicode_READY(rep) < 0) { | |
7321 Py_DECREF(rep); | |
7322 goto error; | |
7323 } | |
7324 | |
7325 outsize = PyUnicode_GET_LENGTH(rep); | |
7326 if (outsize != 1) { | |
7327 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); | |
7328 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); | |
7329 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { | |
7330 Py_DECREF(rep); | |
7331 goto error; | |
7332 } | |
7333 out = PyBytes_AS_STRING(*outbytes) + offset; | |
7334 } | |
7335 kind = PyUnicode_KIND(rep); | |
7336 data = PyUnicode_DATA(rep); | |
7337 for (i=0; i < outsize; i++) { | |
7338 Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
7339 if (ch > 127) { | |
7340 raise_encode_exception(&exc, | |
7341 encoding, unicode, | |
7342 pos, pos + 1, | |
7343 "unable to encode error handler result to ASCII"); | |
7344 Py_DECREF(rep); | |
7345 goto error; | |
7346 } | |
7347 *out = (unsigned char)ch; | |
7348 out++; | |
7349 } | |
7350 } | |
7351 Py_DECREF(rep); | |
7352 } | |
7353 /* write a NUL byte */ | |
7354 *out = 0; | |
7355 outsize = out - PyBytes_AS_STRING(*outbytes); | |
7356 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); | |
7357 if (_PyBytes_Resize(outbytes, outsize) < 0) | |
7358 goto error; | |
7359 ret = 0; | |
7360 | |
7361 error: | |
7362 Py_XDECREF(encoding_obj); | |
7363 Py_XDECREF(errorHandler); | |
7364 Py_XDECREF(exc); | |
7365 return ret; | |
7366 } | |
7367 | |
7368 static PyObject * | |
7369 encode_code_page(int code_page, | |
7370 PyObject *unicode, | |
7371 const char *errors) | |
7372 { | |
7373 Py_ssize_t len; | |
7374 PyObject *outbytes = NULL; | |
7375 Py_ssize_t offset; | |
7376 int chunk_len, ret, done; | |
7377 | |
7378 if (PyUnicode_READY(unicode) < 0) | |
7379 return NULL; | |
7380 len = PyUnicode_GET_LENGTH(unicode); | |
7381 | |
7382 if (code_page < 0) { | |
7383 PyErr_SetString(PyExc_ValueError, "invalid code page number"); | |
7384 return NULL; | |
7385 } | |
7386 | |
7387 if (len == 0) | |
7388 return PyBytes_FromStringAndSize(NULL, 0); | |
7389 | |
7390 offset = 0; | |
7391 do | |
7392 { | |
7393 #ifdef NEED_RETRY | |
7394 /* UTF-16 encoding may double the size, so use only INT_MAX/2 | |
7395 chunks. */ | |
7396 if (len > INT_MAX/2) { | |
7397 chunk_len = INT_MAX/2; | |
7398 done = 0; | |
7399 } | |
7400 else | |
7401 #endif | |
7402 { | |
7403 chunk_len = (int)len; | |
7404 done = 1; | |
7405 } | |
7406 | |
7407 ret = encode_code_page_strict(code_page, &outbytes, | |
7408 unicode, offset, chunk_len, | |
7409 errors); | |
7410 if (ret == -2) | |
7411 ret = encode_code_page_errors(code_page, &outbytes, | |
7412 unicode, offset, | |
7413 chunk_len, errors); | |
7414 if (ret < 0) { | |
7415 Py_XDECREF(outbytes); | |
7416 return NULL; | |
7417 } | |
7418 | |
7419 offset += chunk_len; | |
7420 len -= chunk_len; | |
7421 } while (!done); | |
7422 | |
7423 return outbytes; | |
7424 } | |
7425 | |
7426 PyObject * | |
7427 PyUnicode_EncodeMBCS(const Py_UNICODE *p, | |
7428 Py_ssize_t size, | |
7429 const char *errors) | |
7430 { | |
7431 PyObject *unicode, *res; | |
7432 unicode = PyUnicode_FromUnicode(p, size); | |
7433 if (unicode == NULL) | |
7434 return NULL; | |
7435 res = encode_code_page(CP_ACP, unicode, errors); | |
7436 Py_DECREF(unicode); | |
7437 return res; | |
7438 } | |
7439 | |
7440 PyObject * | |
7441 PyUnicode_EncodeCodePage(int code_page, | |
7442 PyObject *unicode, | |
7443 const char *errors) | |
7444 { | |
7445 return encode_code_page(code_page, unicode, errors); | |
7446 } | |
7447 | |
7448 PyObject * | |
7449 PyUnicode_AsMBCSString(PyObject *unicode) | |
7450 { | |
7451 if (!PyUnicode_Check(unicode)) { | |
7452 PyErr_BadArgument(); | |
7453 return NULL; | |
7454 } | |
7455 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); | |
7456 } | |
7457 | |
7458 #undef NEED_RETRY | |
7459 | |
7460 #endif /* HAVE_MBCS */ | |
7461 | |
7462 /* --- Character Mapping Codec -------------------------------------------- */ | |
7463 | |
7464 PyObject * | |
7465 PyUnicode_DecodeCharmap(const char *s, | |
7466 Py_ssize_t size, | |
7467 PyObject *mapping, | |
7468 const char *errors) | |
7469 { | |
7470 const char *starts = s; | |
7471 Py_ssize_t startinpos; | |
7472 Py_ssize_t endinpos; | |
7473 Py_ssize_t outpos; | |
7474 const char *e; | |
7475 PyObject *v; | |
7476 Py_ssize_t extrachars = 0; | |
7477 PyObject *errorHandler = NULL; | |
7478 PyObject *exc = NULL; | |
7479 | |
7480 /* Default to Latin-1 */ | |
7481 if (mapping == NULL) | |
7482 return PyUnicode_DecodeLatin1(s, size, errors); | |
7483 | |
7484 v = PyUnicode_New(size, 127); | |
7485 if (v == NULL) | |
7486 goto onError; | |
7487 if (size == 0) | |
7488 return v; | |
7489 outpos = 0; | |
7490 e = s + size; | |
7491 if (PyUnicode_CheckExact(mapping)) { | |
7492 Py_ssize_t maplen; | |
7493 enum PyUnicode_Kind kind; | |
7494 void *data; | |
7495 Py_UCS4 x; | |
7496 | |
7497 if (PyUnicode_READY(mapping) < 0) | |
7498 return NULL; | |
7499 | |
7500 maplen = PyUnicode_GET_LENGTH(mapping); | |
7501 data = PyUnicode_DATA(mapping); | |
7502 kind = PyUnicode_KIND(mapping); | |
7503 while (s < e) { | |
7504 unsigned char ch = *s; | |
7505 | |
7506 if (ch < maplen) | |
7507 x = PyUnicode_READ(kind, data, ch); | |
7508 else | |
7509 x = 0xfffe; /* invalid value */ | |
7510 | |
7511 if (x == 0xfffe) | |
7512 { | |
7513 /* undefined mapping */ | |
7514 startinpos = s-starts; | |
7515 endinpos = startinpos+1; | |
7516 if (unicode_decode_call_errorhandler( | |
7517 errors, &errorHandler, | |
7518 "charmap", "character maps to <undefined>", | |
7519 &starts, &e, &startinpos, &endinpos, &exc, &s, | |
7520 &v, &outpos)) { | |
7521 goto onError; | |
7522 } | |
7523 continue; | |
7524 } | |
7525 | |
7526 if (unicode_putchar(&v, &outpos, x) < 0) | |
7527 goto onError; | |
7528 ++s; | |
7529 } | |
7530 } | |
7531 else { | |
7532 while (s < e) { | |
7533 unsigned char ch = *s; | |
7534 PyObject *w, *x; | |
7535 | |
7536 /* Get mapping (char ordinal -> integer, Unicode char or None) */ | |
7537 w = PyLong_FromLong((long)ch); | |
7538 if (w == NULL) | |
7539 goto onError; | |
7540 x = PyObject_GetItem(mapping, w); | |
7541 Py_DECREF(w); | |
7542 if (x == NULL) { | |
7543 if (PyErr_ExceptionMatches(PyExc_LookupError)) { | |
7544 /* No mapping found means: mapping is undefined. */ | |
7545 PyErr_Clear(); | |
7546 x = Py_None; | |
7547 Py_INCREF(x); | |
7548 } else | |
7549 goto onError; | |
7550 } | |
7551 | |
7552 /* Apply mapping */ | |
7553 if (PyLong_Check(x)) { | |
7554 long value = PyLong_AS_LONG(x); | |
7555 if (value < 0 || value > 65535) { | |
7556 PyErr_SetString(PyExc_TypeError, | |
7557 "character mapping must be in range(65536)"); | |
7558 Py_DECREF(x); | |
7559 goto onError; | |
7560 } | |
7561 if (unicode_putchar(&v, &outpos, value) < 0) | |
7562 goto onError; | |
7563 } | |
7564 else if (x == Py_None) { | |
7565 /* undefined mapping */ | |
7566 startinpos = s-starts; | |
7567 endinpos = startinpos+1; | |
7568 if (unicode_decode_call_errorhandler( | |
7569 errors, &errorHandler, | |
7570 "charmap", "character maps to <undefined>", | |
7571 &starts, &e, &startinpos, &endinpos, &exc, &s, | |
7572 &v, &outpos)) { | |
7573 Py_DECREF(x); | |
7574 goto onError; | |
7575 } | |
7576 Py_DECREF(x); | |
7577 continue; | |
7578 } | |
7579 else if (PyUnicode_Check(x)) { | |
7580 Py_ssize_t targetsize; | |
7581 | |
7582 if (PyUnicode_READY(x) < 0) | |
7583 goto onError; | |
7584 targetsize = PyUnicode_GET_LENGTH(x); | |
7585 | |
7586 if (targetsize == 1) { | |
7587 /* 1-1 mapping */ | |
7588 if (unicode_putchar(&v, &outpos, | |
7589 PyUnicode_READ_CHAR(x, 0)) < 0) | |
7590 goto onError; | |
7591 } | |
7592 else if (targetsize > 1) { | |
7593 /* 1-n mapping */ | |
7594 if (targetsize > extrachars) { | |
7595 /* resize first */ | |
7596 Py_ssize_t needed = (targetsize - extrachars) + \ | |
7597 (targetsize << 2); | |
7598 extrachars += needed; | |
7599 /* XXX overflow detection missing */ | |
7600 if (PyUnicode_Resize(&v, | |
7601 PyUnicode_GET_LENGTH(v) + needed) < 0) { | |
7602 Py_DECREF(x); | |
7603 goto onError; | |
7604 } | |
7605 } | |
7606 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0) | |
7607 goto onError; | |
7608 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize); | |
7609 outpos += targetsize; | |
7610 extrachars -= targetsize; | |
7611 } | |
7612 /* 1-0 mapping: skip the character */ | |
7613 } | |
7614 else { | |
7615 /* wrong return value */ | |
7616 PyErr_SetString(PyExc_TypeError, | |
7617 "character mapping must return integer, None or str"); | |
7618 Py_DECREF(x); | |
7619 goto onError; | |
7620 } | |
7621 Py_DECREF(x); | |
7622 ++s; | |
7623 } | |
7624 } | |
7625 if (PyUnicode_Resize(&v, outpos) < 0) | |
7626 goto onError; | |
7627 Py_XDECREF(errorHandler); | |
7628 Py_XDECREF(exc); | |
7629 return unicode_result(v); | |
7630 | |
7631 onError: | |
7632 Py_XDECREF(errorHandler); | |
7633 Py_XDECREF(exc); | |
7634 Py_XDECREF(v); | |
7635 return NULL; | |
7636 } | |
7637 | |
7638 /* Charmap encoding: the lookup table */ | |
7639 | |
7640 struct encoding_map { | |
7641 PyObject_HEAD | |
7642 unsigned char level1[32]; | |
7643 int count2, count3; | |
7644 unsigned char level23[1]; | |
7645 }; | |
7646 | |
7647 static PyObject* | |
7648 encoding_map_size(PyObject *obj, PyObject* args) | |
7649 { | |
7650 struct encoding_map *map = (struct encoding_map*)obj; | |
7651 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + | |
7652 128*map->count3); | |
7653 } | |
7654 | |
7655 static PyMethodDef encoding_map_methods[] = { | |
7656 {"size", encoding_map_size, METH_NOARGS, | |
7657 PyDoc_STR("Return the size (in bytes) of this object") }, | |
7658 { 0 } | |
7659 }; | |
7660 | |
7661 static void | |
7662 encoding_map_dealloc(PyObject* o) | |
7663 { | |
7664 PyObject_FREE(o); | |
7665 } | |
7666 | |
7667 static PyTypeObject EncodingMapType = { | |
7668 PyVarObject_HEAD_INIT(NULL, 0) | |
7669 "EncodingMap", /*tp_name*/ | |
7670 sizeof(struct encoding_map), /*tp_basicsize*/ | |
7671 0, /*tp_itemsize*/ | |
7672 /* methods */ | |
7673 encoding_map_dealloc, /*tp_dealloc*/ | |
7674 0, /*tp_print*/ | |
7675 0, /*tp_getattr*/ | |
7676 0, /*tp_setattr*/ | |
7677 0, /*tp_reserved*/ | |
7678 0, /*tp_repr*/ | |
7679 0, /*tp_as_number*/ | |
7680 0, /*tp_as_sequence*/ | |
7681 0, /*tp_as_mapping*/ | |
7682 0, /*tp_hash*/ | |
7683 0, /*tp_call*/ | |
7684 0, /*tp_str*/ | |
7685 0, /*tp_getattro*/ | |
7686 0, /*tp_setattro*/ | |
7687 0, /*tp_as_buffer*/ | |
7688 Py_TPFLAGS_DEFAULT, /*tp_flags*/ | |
7689 0, /*tp_doc*/ | |
7690 0, /*tp_traverse*/ | |
7691 0, /*tp_clear*/ | |
7692 0, /*tp_richcompare*/ | |
7693 0, /*tp_weaklistoffset*/ | |
7694 0, /*tp_iter*/ | |
7695 0, /*tp_iternext*/ | |
7696 encoding_map_methods, /*tp_methods*/ | |
7697 0, /*tp_members*/ | |
7698 0, /*tp_getset*/ | |
7699 0, /*tp_base*/ | |
7700 0, /*tp_dict*/ | |
7701 0, /*tp_descr_get*/ | |
7702 0, /*tp_descr_set*/ | |
7703 0, /*tp_dictoffset*/ | |
7704 0, /*tp_init*/ | |
7705 0, /*tp_alloc*/ | |
7706 0, /*tp_new*/ | |
7707 0, /*tp_free*/ | |
7708 0, /*tp_is_gc*/ | |
7709 }; | |
7710 | |
7711 PyObject* | |
7712 PyUnicode_BuildEncodingMap(PyObject* string) | |
7713 { | |
7714 PyObject *result; | |
7715 struct encoding_map *mresult; | |
7716 int i; | |
7717 int need_dict = 0; | |
7718 unsigned char level1[32]; | |
7719 unsigned char level2[512]; | |
7720 unsigned char *mlevel1, *mlevel2, *mlevel3; | |
7721 int count2 = 0, count3 = 0; | |
7722 int kind; | |
7723 void *data; | |
7724 Py_UCS4 ch; | |
7725 | |
7726 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { | |
7727 PyErr_BadArgument(); | |
7728 return NULL; | |
7729 } | |
7730 kind = PyUnicode_KIND(string); | |
7731 data = PyUnicode_DATA(string); | |
7732 memset(level1, 0xFF, sizeof level1); | |
7733 memset(level2, 0xFF, sizeof level2); | |
7734 | |
7735 /* If there isn't a one-to-one mapping of NULL to \0, | |
7736 or if there are non-BMP characters, we need to use | |
7737 a mapping dictionary. */ | |
7738 if (PyUnicode_READ(kind, data, 0) != 0) | |
7739 need_dict = 1; | |
7740 for (i = 1; i < 256; i++) { | |
7741 int l1, l2; | |
7742 ch = PyUnicode_READ(kind, data, i); | |
7743 if (ch == 0 || ch > 0xFFFF) { | |
7744 need_dict = 1; | |
7745 break; | |
7746 } | |
7747 if (ch == 0xFFFE) | |
7748 /* unmapped character */ | |
7749 continue; | |
7750 l1 = ch >> 11; | |
7751 l2 = ch >> 7; | |
7752 if (level1[l1] == 0xFF) | |
7753 level1[l1] = count2++; | |
7754 if (level2[l2] == 0xFF) | |
7755 level2[l2] = count3++; | |
7756 } | |
7757 | |
7758 if (count2 >= 0xFF || count3 >= 0xFF) | |
7759 need_dict = 1; | |
7760 | |
7761 if (need_dict) { | |
7762 PyObject *result = PyDict_New(); | |
7763 PyObject *key, *value; | |
7764 if (!result) | |
7765 return NULL; | |
7766 for (i = 0; i < 256; i++) { | |
7767 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); | |
7768 value = PyLong_FromLong(i); | |
7769 if (!key || !value) | |
7770 goto failed1; | |
7771 if (PyDict_SetItem(result, key, value) == -1) | |
7772 goto failed1; | |
7773 Py_DECREF(key); | |
7774 Py_DECREF(value); | |
7775 } | |
7776 return result; | |
7777 failed1: | |
7778 Py_XDECREF(key); | |
7779 Py_XDECREF(value); | |
7780 Py_DECREF(result); | |
7781 return NULL; | |
7782 } | |
7783 | |
7784 /* Create a three-level trie */ | |
7785 result = PyObject_MALLOC(sizeof(struct encoding_map) + | |
7786 16*count2 + 128*count3 - 1); | |
7787 if (!result) | |
7788 return PyErr_NoMemory(); | |
7789 PyObject_Init(result, &EncodingMapType); | |
7790 mresult = (struct encoding_map*)result; | |
7791 mresult->count2 = count2; | |
7792 mresult->count3 = count3; | |
7793 mlevel1 = mresult->level1; | |
7794 mlevel2 = mresult->level23; | |
7795 mlevel3 = mresult->level23 + 16*count2; | |
7796 memcpy(mlevel1, level1, 32); | |
7797 memset(mlevel2, 0xFF, 16*count2); | |
7798 memset(mlevel3, 0, 128*count3); | |
7799 count3 = 0; | |
7800 for (i = 1; i < 256; i++) { | |
7801 int o1, o2, o3, i2, i3; | |
7802 if (PyUnicode_READ(kind, data, i) == 0xFFFE) | |
7803 /* unmapped character */ | |
7804 continue; | |
7805 o1 = PyUnicode_READ(kind, data, i)>>11; | |
7806 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; | |
7807 i2 = 16*mlevel1[o1] + o2; | |
7808 if (mlevel2[i2] == 0xFF) | |
7809 mlevel2[i2] = count3++; | |
7810 o3 = PyUnicode_READ(kind, data, i) & 0x7F; | |
7811 i3 = 128*mlevel2[i2] + o3; | |
7812 mlevel3[i3] = i; | |
7813 } | |
7814 return result; | |
7815 } | |
7816 | |
7817 static int | |
7818 encoding_map_lookup(Py_UCS4 c, PyObject *mapping) | |
7819 { | |
7820 struct encoding_map *map = (struct encoding_map*)mapping; | |
7821 int l1 = c>>11; | |
7822 int l2 = (c>>7) & 0xF; | |
7823 int l3 = c & 0x7F; | |
7824 int i; | |
7825 | |
7826 if (c > 0xFFFF) | |
7827 return -1; | |
7828 if (c == 0) | |
7829 return 0; | |
7830 /* level 1*/ | |
7831 i = map->level1[l1]; | |
7832 if (i == 0xFF) { | |
7833 return -1; | |
7834 } | |
7835 /* level 2*/ | |
7836 i = map->level23[16*i+l2]; | |
7837 if (i == 0xFF) { | |
7838 return -1; | |
7839 } | |
7840 /* level 3 */ | |
7841 i = map->level23[16*map->count2 + 128*i + l3]; | |
7842 if (i == 0) { | |
7843 return -1; | |
7844 } | |
7845 return i; | |
7846 } | |
7847 | |
7848 /* Lookup the character ch in the mapping. If the character | |
7849 can't be found, Py_None is returned (or NULL, if another | |
7850 error occurred). */ | |
7851 static PyObject * | |
7852 charmapencode_lookup(Py_UCS4 c, PyObject *mapping) | |
7853 { | |
7854 PyObject *w = PyLong_FromLong((long)c); | |
7855 PyObject *x; | |
7856 | |
7857 if (w == NULL) | |
7858 return NULL; | |
7859 x = PyObject_GetItem(mapping, w); | |
7860 Py_DECREF(w); | |
7861 if (x == NULL) { | |
7862 if (PyErr_ExceptionMatches(PyExc_LookupError)) { | |
7863 /* No mapping found means: mapping is undefined. */ | |
7864 PyErr_Clear(); | |
7865 x = Py_None; | |
7866 Py_INCREF(x); | |
7867 return x; | |
7868 } else | |
7869 return NULL; | |
7870 } | |
7871 else if (x == Py_None) | |
7872 return x; | |
7873 else if (PyLong_Check(x)) { | |
7874 long value = PyLong_AS_LONG(x); | |
7875 if (value < 0 || value > 255) { | |
7876 PyErr_SetString(PyExc_TypeError, | |
7877 "character mapping must be in range(256)"); | |
7878 Py_DECREF(x); | |
7879 return NULL; | |
7880 } | |
7881 return x; | |
7882 } | |
7883 else if (PyBytes_Check(x)) | |
7884 return x; | |
7885 else { | |
7886 /* wrong return value */ | |
7887 PyErr_Format(PyExc_TypeError, | |
7888 "character mapping must return integer, bytes or None, not %.400s", | |
7889 x->ob_type->tp_name); | |
7890 Py_DECREF(x); | |
7891 return NULL; | |
7892 } | |
7893 } | |
7894 | |
7895 static int | |
7896 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) | |
7897 { | |
7898 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); | |
7899 /* exponentially overallocate to minimize reallocations */ | |
7900 if (requiredsize < 2*outsize) | |
7901 requiredsize = 2*outsize; | |
7902 if (_PyBytes_Resize(outobj, requiredsize)) | |
7903 return -1; | |
7904 return 0; | |
7905 } | |
7906 | |
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* lookup or buffer resize failed */
} charmapencode_result;
7910 /* lookup the character, put the result in the output string and adjust | |
7911 various state variables. Resize the output bytes object if not enough | |
7912 space is available. Return a new reference to the object that | |
7913 was put in the output buffer, or Py_None, if the mapping was undefined | |
7914 (in which case no character was written) or NULL, if a | |
7915 reallocation error occurred. The caller must decref the result */ | |
7916 static charmapencode_result | |
7917 charmapencode_output(Py_UCS4 c, PyObject *mapping, | |
7918 PyObject **outobj, Py_ssize_t *outpos) | |
7919 { | |
7920 PyObject *rep; | |
7921 char *outstart; | |
7922 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); | |
7923 | |
7924 if (Py_TYPE(mapping) == &EncodingMapType) { | |
7925 int res = encoding_map_lookup(c, mapping); | |
7926 Py_ssize_t requiredsize = *outpos+1; | |
7927 if (res == -1) | |
7928 return enc_FAILED; | |
7929 if (outsize<requiredsize) | |
7930 if (charmapencode_resize(outobj, outpos, requiredsize)) | |
7931 return enc_EXCEPTION; | |
7932 outstart = PyBytes_AS_STRING(*outobj); | |
7933 outstart[(*outpos)++] = (char)res; | |
7934 return enc_SUCCESS; | |
7935 } | |
7936 | |
7937 rep = charmapencode_lookup(c, mapping); | |
7938 if (rep==NULL) | |
7939 return enc_EXCEPTION; | |
7940 else if (rep==Py_None) { | |
7941 Py_DECREF(rep); | |
7942 return enc_FAILED; | |
7943 } else { | |
7944 if (PyLong_Check(rep)) { | |
7945 Py_ssize_t requiredsize = *outpos+1; | |
7946 if (outsize<requiredsize) | |
7947 if (charmapencode_resize(outobj, outpos, requiredsize)) { | |
7948 Py_DECREF(rep); | |
7949 return enc_EXCEPTION; | |
7950 } | |
7951 outstart = PyBytes_AS_STRING(*outobj); | |
7952 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); | |
7953 } | |
7954 else { | |
7955 const char *repchars = PyBytes_AS_STRING(rep); | |
7956 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); | |
7957 Py_ssize_t requiredsize = *outpos+repsize; | |
7958 if (outsize<requiredsize) | |
7959 if (charmapencode_resize(outobj, outpos, requiredsize)) { | |
7960 Py_DECREF(rep); | |
7961 return enc_EXCEPTION; | |
7962 } | |
7963 outstart = PyBytes_AS_STRING(*outobj); | |
7964 memcpy(outstart + *outpos, repchars, repsize); | |
7965 *outpos += repsize; | |
7966 } | |
7967 } | |
7968 Py_DECREF(rep); | |
7969 return enc_SUCCESS; | |
7970 } | |
7971 | |
7972 /* handle an error in PyUnicode_EncodeCharmap | |
7973 Return 0 on success, -1 on error */ | |
7974 static int | |
7975 charmap_encoding_error( | |
7976 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, | |
7977 PyObject **exceptionObject, | |
7978 int *known_errorHandler, PyObject **errorHandler, const char *errors, | |
7979 PyObject **res, Py_ssize_t *respos) | |
7980 { | |
7981 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ | |
7982 Py_ssize_t size, repsize; | |
7983 Py_ssize_t newpos; | |
7984 enum PyUnicode_Kind kind; | |
7985 void *data; | |
7986 Py_ssize_t index; | |
7987 /* startpos for collecting unencodable chars */ | |
7988 Py_ssize_t collstartpos = *inpos; | |
7989 Py_ssize_t collendpos = *inpos+1; | |
7990 Py_ssize_t collpos; | |
7991 char *encoding = "charmap"; | |
7992 char *reason = "character maps to <undefined>"; | |
7993 charmapencode_result x; | |
7994 Py_UCS4 ch; | |
7995 int val; | |
7996 | |
7997 if (PyUnicode_READY(unicode) < 0) | |
7998 return -1; | |
7999 size = PyUnicode_GET_LENGTH(unicode); | |
8000 /* find all unencodable characters */ | |
8001 while (collendpos < size) { | |
8002 PyObject *rep; | |
8003 if (Py_TYPE(mapping) == &EncodingMapType) { | |
8004 ch = PyUnicode_READ_CHAR(unicode, collendpos); | |
8005 val = encoding_map_lookup(ch, mapping); | |
8006 if (val != -1) | |
8007 break; | |
8008 ++collendpos; | |
8009 continue; | |
8010 } | |
8011 | |
8012 ch = PyUnicode_READ_CHAR(unicode, collendpos); | |
8013 rep = charmapencode_lookup(ch, mapping); | |
8014 if (rep==NULL) | |
8015 return -1; | |
8016 else if (rep!=Py_None) { | |
8017 Py_DECREF(rep); | |
8018 break; | |
8019 } | |
8020 Py_DECREF(rep); | |
8021 ++collendpos; | |
8022 } | |
8023 /* cache callback name lookup | |
8024 * (if not done yet, i.e. it's the first error) */ | |
8025 if (*known_errorHandler==-1) { | |
8026 if ((errors==NULL) || (!strcmp(errors, "strict"))) | |
8027 *known_errorHandler = 1; | |
8028 else if (!strcmp(errors, "replace")) | |
8029 *known_errorHandler = 2; | |
8030 else if (!strcmp(errors, "ignore")) | |
8031 *known_errorHandler = 3; | |
8032 else if (!strcmp(errors, "xmlcharrefreplace")) | |
8033 *known_errorHandler = 4; | |
8034 else | |
8035 *known_errorHandler = 0; | |
8036 } | |
8037 switch (*known_errorHandler) { | |
8038 case 1: /* strict */ | |
8039 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); | |
8040 return -1; | |
8041 case 2: /* replace */ | |
8042 for (collpos = collstartpos; collpos<collendpos; ++collpos) { | |
8043 x = charmapencode_output('?', mapping, res, respos); | |
8044 if (x==enc_EXCEPTION) { | |
8045 return -1; | |
8046 } | |
8047 else if (x==enc_FAILED) { | |
8048 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); | |
8049 return -1; | |
8050 } | |
8051 } | |
8052 /* fall through */ | |
8053 case 3: /* ignore */ | |
8054 *inpos = collendpos; | |
8055 break; | |
8056 case 4: /* xmlcharrefreplace */ | |
8057 /* generate replacement (temporarily (mis)uses p) */ | |
8058 for (collpos = collstartpos; collpos < collendpos; ++collpos) { | |
8059 char buffer[2+29+1+1]; | |
8060 char *cp; | |
8061 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); | |
8062 for (cp = buffer; *cp; ++cp) { | |
8063 x = charmapencode_output(*cp, mapping, res, respos); | |
8064 if (x==enc_EXCEPTION) | |
8065 return -1; | |
8066 else if (x==enc_FAILED) { | |
8067 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); | |
8068 return -1; | |
8069 } | |
8070 } | |
8071 } | |
8072 *inpos = collendpos; | |
8073 break; | |
8074 default: | |
8075 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, | |
8076 encoding, reason, unicode, exceptionObject, | |
8077 collstartpos, collendpos, &newpos); | |
8078 if (repunicode == NULL) | |
8079 return -1; | |
8080 if (PyBytes_Check(repunicode)) { | |
8081 /* Directly copy bytes result to output. */ | |
8082 Py_ssize_t outsize = PyBytes_Size(*res); | |
8083 Py_ssize_t requiredsize; | |
8084 repsize = PyBytes_Size(repunicode); | |
8085 requiredsize = *respos + repsize; | |
8086 if (requiredsize > outsize) | |
8087 /* Make room for all additional bytes. */ | |
8088 if (charmapencode_resize(res, respos, requiredsize)) { | |
8089 Py_DECREF(repunicode); | |
8090 return -1; | |
8091 } | |
8092 memcpy(PyBytes_AsString(*res) + *respos, | |
8093 PyBytes_AsString(repunicode), repsize); | |
8094 *respos += repsize; | |
8095 *inpos = newpos; | |
8096 Py_DECREF(repunicode); | |
8097 break; | |
8098 } | |
8099 /* generate replacement */ | |
8100 if (PyUnicode_READY(repunicode) < 0) { | |
8101 Py_DECREF(repunicode); | |
8102 return -1; | |
8103 } | |
8104 repsize = PyUnicode_GET_LENGTH(repunicode); | |
8105 data = PyUnicode_DATA(repunicode); | |
8106 kind = PyUnicode_KIND(repunicode); | |
8107 for (index = 0; index < repsize; index++) { | |
8108 Py_UCS4 repch = PyUnicode_READ(kind, data, index); | |
8109 x = charmapencode_output(repch, mapping, res, respos); | |
8110 if (x==enc_EXCEPTION) { | |
8111 Py_DECREF(repunicode); | |
8112 return -1; | |
8113 } | |
8114 else if (x==enc_FAILED) { | |
8115 Py_DECREF(repunicode); | |
8116 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); | |
8117 return -1; | |
8118 } | |
8119 } | |
8120 *inpos = newpos; | |
8121 Py_DECREF(repunicode); | |
8122 } | |
8123 return 0; | |
8124 } | |
8125 | |
8126 PyObject * | |
8127 _PyUnicode_EncodeCharmap(PyObject *unicode, | |
8128 PyObject *mapping, | |
8129 const char *errors) | |
8130 { | |
8131 /* output object */ | |
8132 PyObject *res = NULL; | |
8133 /* current input position */ | |
8134 Py_ssize_t inpos = 0; | |
8135 Py_ssize_t size; | |
8136 /* current output position */ | |
8137 Py_ssize_t respos = 0; | |
8138 PyObject *errorHandler = NULL; | |
8139 PyObject *exc = NULL; | |
8140 /* the following variable is used for caching string comparisons | |
8141 * -1=not initialized, 0=unknown, 1=strict, 2=replace, | |
8142 * 3=ignore, 4=xmlcharrefreplace */ | |
8143 int known_errorHandler = -1; | |
8144 | |
8145 if (PyUnicode_READY(unicode) < 0) | |
8146 return NULL; | |
8147 size = PyUnicode_GET_LENGTH(unicode); | |
8148 | |
8149 /* Default to Latin-1 */ | |
8150 if (mapping == NULL) | |
8151 return unicode_encode_ucs1(unicode, errors, 256); | |
8152 | |
8153 /* allocate enough for a simple encoding without | |
8154 replacements, if we need more, we'll resize */ | |
8155 res = PyBytes_FromStringAndSize(NULL, size); | |
8156 if (res == NULL) | |
8157 goto onError; | |
8158 if (size == 0) | |
8159 return res; | |
8160 | |
8161 while (inpos<size) { | |
8162 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos); | |
8163 /* try to encode it */ | |
8164 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); | |
8165 if (x==enc_EXCEPTION) /* error */ | |
8166 goto onError; | |
8167 if (x==enc_FAILED) { /* unencodable character */ | |
8168 if (charmap_encoding_error(unicode, &inpos, mapping, | |
8169 &exc, | |
8170 &known_errorHandler, &errorHandler, errors, | |
8171 &res, &respos)) { | |
8172 goto onError; | |
8173 } | |
8174 } | |
8175 else | |
8176 /* done with this character => adjust input position */ | |
8177 ++inpos; | |
8178 } | |
8179 | |
8180 /* Resize if we allocated to much */ | |
8181 if (respos<PyBytes_GET_SIZE(res)) | |
8182 if (_PyBytes_Resize(&res, respos) < 0) | |
8183 goto onError; | |
8184 | |
8185 Py_XDECREF(exc); | |
8186 Py_XDECREF(errorHandler); | |
8187 return res; | |
8188 | |
8189 onError: | |
8190 Py_XDECREF(res); | |
8191 Py_XDECREF(exc); | |
8192 Py_XDECREF(errorHandler); | |
8193 return NULL; | |
8194 } | |
8195 | |
8196 /* Deprecated */ | |
8197 PyObject * | |
8198 PyUnicode_EncodeCharmap(const Py_UNICODE *p, | |
8199 Py_ssize_t size, | |
8200 PyObject *mapping, | |
8201 const char *errors) | |
8202 { | |
8203 PyObject *result; | |
8204 PyObject *unicode = PyUnicode_FromUnicode(p, size); | |
8205 if (unicode == NULL) | |
8206 return NULL; | |
8207 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); | |
8208 Py_DECREF(unicode); | |
8209 return result; | |
8210 } | |
8211 | |
8212 PyObject * | |
8213 PyUnicode_AsCharmapString(PyObject *unicode, | |
8214 PyObject *mapping) | |
8215 { | |
8216 if (!PyUnicode_Check(unicode) || mapping == NULL) { | |
8217 PyErr_BadArgument(); | |
8218 return NULL; | |
8219 } | |
8220 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); | |
8221 } | |
8222 | |
8223 /* create or adjust a UnicodeTranslateError */ | |
8224 static void | |
8225 make_translate_exception(PyObject **exceptionObject, | |
8226 PyObject *unicode, | |
8227 Py_ssize_t startpos, Py_ssize_t endpos, | |
8228 const char *reason) | |
8229 { | |
8230 if (*exceptionObject == NULL) { | |
8231 *exceptionObject = _PyUnicodeTranslateError_Create( | |
8232 unicode, startpos, endpos, reason); | |
8233 } | |
8234 else { | |
8235 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) | |
8236 goto onError; | |
8237 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) | |
8238 goto onError; | |
8239 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) | |
8240 goto onError; | |
8241 return; | |
8242 onError: | |
8243 Py_DECREF(*exceptionObject); | |
8244 *exceptionObject = NULL; | |
8245 } | |
8246 } | |
8247 | |
8248 /* raises a UnicodeTranslateError */ | |
8249 static void | |
8250 raise_translate_exception(PyObject **exceptionObject, | |
8251 PyObject *unicode, | |
8252 Py_ssize_t startpos, Py_ssize_t endpos, | |
8253 const char *reason) | |
8254 { | |
8255 make_translate_exception(exceptionObject, | |
8256 unicode, startpos, endpos, reason); | |
8257 if (*exceptionObject != NULL) | |
8258 PyCodec_StrictErrors(*exceptionObject); | |
8259 } | |
8260 | |
8261 /* error handling callback helper: | |
8262 build arguments, call the callback and check the arguments, | |
8263 put the result into newpos and return the replacement string, which | |
8264 has to be freed by the caller */ | |
8265 static PyObject * | |
8266 unicode_translate_call_errorhandler(const char *errors, | |
8267 PyObject **errorHandler, | |
8268 const char *reason, | |
8269 PyObject *unicode, PyObject **exceptionObject, | |
8270 Py_ssize_t startpos, Py_ssize_t endpos, | |
8271 Py_ssize_t *newpos) | |
8272 { | |
8273 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; | |
8274 | |
8275 Py_ssize_t i_newpos; | |
8276 PyObject *restuple; | |
8277 PyObject *resunicode; | |
8278 | |
8279 if (*errorHandler == NULL) { | |
8280 *errorHandler = PyCodec_LookupError(errors); | |
8281 if (*errorHandler == NULL) | |
8282 return NULL; | |
8283 } | |
8284 | |
8285 make_translate_exception(exceptionObject, | |
8286 unicode, startpos, endpos, reason); | |
8287 if (*exceptionObject == NULL) | |
8288 return NULL; | |
8289 | |
8290 restuple = PyObject_CallFunctionObjArgs( | |
8291 *errorHandler, *exceptionObject, NULL); | |
8292 if (restuple == NULL) | |
8293 return NULL; | |
8294 if (!PyTuple_Check(restuple)) { | |
8295 PyErr_SetString(PyExc_TypeError, &argparse[4]); | |
8296 Py_DECREF(restuple); | |
8297 return NULL; | |
8298 } | |
8299 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, | |
8300 &resunicode, &i_newpos)) { | |
8301 Py_DECREF(restuple); | |
8302 return NULL; | |
8303 } | |
8304 if (i_newpos<0) | |
8305 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; | |
8306 else | |
8307 *newpos = i_newpos; | |
8308 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { | |
8309 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); | |
8310 Py_DECREF(restuple); | |
8311 return NULL; | |
8312 } | |
8313 Py_INCREF(resunicode); | |
8314 Py_DECREF(restuple); | |
8315 return resunicode; | |
8316 } | |
8317 | |
8318 /* Lookup the character ch in the mapping and put the result in result, | |
8319 which must be decrefed by the caller. | |
8320 Return 0 on success, -1 on error */ | |
8321 static int | |
8322 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) | |
8323 { | |
8324 PyObject *w = PyLong_FromLong((long)c); | |
8325 PyObject *x; | |
8326 | |
8327 if (w == NULL) | |
8328 return -1; | |
8329 x = PyObject_GetItem(mapping, w); | |
8330 Py_DECREF(w); | |
8331 if (x == NULL) { | |
8332 if (PyErr_ExceptionMatches(PyExc_LookupError)) { | |
8333 /* No mapping found means: use 1:1 mapping. */ | |
8334 PyErr_Clear(); | |
8335 *result = NULL; | |
8336 return 0; | |
8337 } else | |
8338 return -1; | |
8339 } | |
8340 else if (x == Py_None) { | |
8341 *result = x; | |
8342 return 0; | |
8343 } | |
8344 else if (PyLong_Check(x)) { | |
8345 long value = PyLong_AS_LONG(x); | |
8346 long max = PyUnicode_GetMax(); | |
8347 if (value < 0 || value > max) { | |
8348 PyErr_Format(PyExc_TypeError, | |
8349 "character mapping must be in range(0x%x)", max+1); | |
8350 Py_DECREF(x); | |
8351 return -1; | |
8352 } | |
8353 *result = x; | |
8354 return 0; | |
8355 } | |
8356 else if (PyUnicode_Check(x)) { | |
8357 *result = x; | |
8358 return 0; | |
8359 } | |
8360 else { | |
8361 /* wrong return value */ | |
8362 PyErr_SetString(PyExc_TypeError, | |
8363 "character mapping must return integer, None or str"); | |
8364 Py_DECREF(x); | |
8365 return -1; | |
8366 } | |
8367 } | |
8368 /* ensure that *outobj is at least requiredsize characters long, | |
8369 if not reallocate and adjust various state variables. | |
8370 Return 0 on success, -1 on error */ | |
8371 static int | |
8372 charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, | |
8373 Py_ssize_t requiredsize) | |
8374 { | |
8375 Py_ssize_t oldsize = *psize; | |
8376 if (requiredsize > oldsize) { | |
8377 /* exponentially overallocate to minimize reallocations */ | |
8378 if (requiredsize < 2 * oldsize) | |
8379 requiredsize = 2 * oldsize; | |
8380 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); | |
8381 if (*outobj == 0) | |
8382 return -1; | |
8383 *psize = requiredsize; | |
8384 } | |
8385 return 0; | |
8386 } | |
8387 /* lookup the character, put the result in the output string and adjust | |
8388 various state variables. Return a new reference to the object that | |
8389 was put in the output buffer in *result, or Py_None, if the mapping was | |
8390 undefined (in which case no character was written). | |
8391 The called must decref result. | |
8392 Return 0 on success, -1 on error. */ | |
8393 static int | |
8394 charmaptranslate_output(PyObject *input, Py_ssize_t ipos, | |
8395 PyObject *mapping, Py_UCS4 **output, | |
8396 Py_ssize_t *osize, Py_ssize_t *opos, | |
8397 PyObject **res) | |
8398 { | |
8399 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); | |
8400 if (charmaptranslate_lookup(curinp, mapping, res)) | |
8401 return -1; | |
8402 if (*res==NULL) { | |
8403 /* not found => default to 1:1 mapping */ | |
8404 (*output)[(*opos)++] = curinp; | |
8405 } | |
8406 else if (*res==Py_None) | |
8407 ; | |
8408 else if (PyLong_Check(*res)) { | |
8409 /* no overflow check, because we know that the space is enough */ | |
8410 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); | |
8411 } | |
8412 else if (PyUnicode_Check(*res)) { | |
8413 Py_ssize_t repsize; | |
8414 if (PyUnicode_READY(*res) == -1) | |
8415 return -1; | |
8416 repsize = PyUnicode_GET_LENGTH(*res); | |
8417 if (repsize==1) { | |
8418 /* no overflow check, because we know that the space is enough */ | |
8419 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); | |
8420 } | |
8421 else if (repsize!=0) { | |
8422 /* more than one character */ | |
8423 Py_ssize_t requiredsize = *opos + | |
8424 (PyUnicode_GET_LENGTH(input) - ipos) + | |
8425 repsize - 1; | |
8426 Py_ssize_t i; | |
8427 if (charmaptranslate_makespace(output, osize, requiredsize)) | |
8428 return -1; | |
8429 for(i = 0; i < repsize; i++) | |
8430 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); | |
8431 } | |
8432 } | |
8433 else | |
8434 return -1; | |
8435 return 0; | |
8436 } | |
8437 | |
8438 PyObject * | |
8439 _PyUnicode_TranslateCharmap(PyObject *input, | |
8440 PyObject *mapping, | |
8441 const char *errors) | |
8442 { | |
8443 /* input object */ | |
8444 char *idata; | |
8445 Py_ssize_t size, i; | |
8446 int kind; | |
8447 /* output buffer */ | |
8448 Py_UCS4 *output = NULL; | |
8449 Py_ssize_t osize; | |
8450 PyObject *res; | |
8451 /* current output position */ | |
8452 Py_ssize_t opos; | |
8453 char *reason = "character maps to <undefined>"; | |
8454 PyObject *errorHandler = NULL; | |
8455 PyObject *exc = NULL; | |
8456 /* the following variable is used for caching string comparisons | |
8457 * -1=not initialized, 0=unknown, 1=strict, 2=replace, | |
8458 * 3=ignore, 4=xmlcharrefreplace */ | |
8459 int known_errorHandler = -1; | |
8460 | |
8461 if (mapping == NULL) { | |
8462 PyErr_BadArgument(); | |
8463 return NULL; | |
8464 } | |
8465 | |
8466 if (PyUnicode_READY(input) == -1) | |
8467 return NULL; | |
8468 idata = (char*)PyUnicode_DATA(input); | |
8469 kind = PyUnicode_KIND(input); | |
8470 size = PyUnicode_GET_LENGTH(input); | |
8471 i = 0; | |
8472 | |
8473 if (size == 0) { | |
8474 Py_INCREF(input); | |
8475 return input; | |
8476 } | |
8477 | |
8478 /* allocate enough for a simple 1:1 translation without | |
8479 replacements, if we need more, we'll resize */ | |
8480 osize = size; | |
8481 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); | |
8482 opos = 0; | |
8483 if (output == NULL) { | |
8484 PyErr_NoMemory(); | |
8485 goto onError; | |
8486 } | |
8487 | |
8488 while (i<size) { | |
8489 /* try to encode it */ | |
8490 PyObject *x = NULL; | |
8491 if (charmaptranslate_output(input, i, mapping, | |
8492 &output, &osize, &opos, &x)) { | |
8493 Py_XDECREF(x); | |
8494 goto onError; | |
8495 } | |
8496 Py_XDECREF(x); | |
8497 if (x!=Py_None) /* it worked => adjust input pointer */ | |
8498 ++i; | |
8499 else { /* untranslatable character */ | |
8500 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ | |
8501 Py_ssize_t repsize; | |
8502 Py_ssize_t newpos; | |
8503 Py_ssize_t uni2; | |
8504 /* startpos for collecting untranslatable chars */ | |
8505 Py_ssize_t collstart = i; | |
8506 Py_ssize_t collend = i+1; | |
8507 Py_ssize_t coll; | |
8508 | |
8509 /* find all untranslatable characters */ | |
8510 while (collend < size) { | |
8511 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) | |
8512 goto onError; | |
8513 Py_XDECREF(x); | |
8514 if (x!=Py_None) | |
8515 break; | |
8516 ++collend; | |
8517 } | |
8518 /* cache callback name lookup | |
8519 * (if not done yet, i.e. it's the first error) */ | |
8520 if (known_errorHandler==-1) { | |
8521 if ((errors==NULL) || (!strcmp(errors, "strict"))) | |
8522 known_errorHandler = 1; | |
8523 else if (!strcmp(errors, "replace")) | |
8524 known_errorHandler = 2; | |
8525 else if (!strcmp(errors, "ignore")) | |
8526 known_errorHandler = 3; | |
8527 else if (!strcmp(errors, "xmlcharrefreplace")) | |
8528 known_errorHandler = 4; | |
8529 else | |
8530 known_errorHandler = 0; | |
8531 } | |
8532 switch (known_errorHandler) { | |
8533 case 1: /* strict */ | |
8534 raise_translate_exception(&exc, input, collstart, | |
8535 collend, reason); | |
8536 goto onError; | |
8537 case 2: /* replace */ | |
8538 /* No need to check for space, this is a 1:1 replacement */ | |
8539 for (coll = collstart; coll<collend; coll++) | |
8540 output[opos++] = '?'; | |
8541 /* fall through */ | |
8542 case 3: /* ignore */ | |
8543 i = collend; | |
8544 break; | |
8545 case 4: /* xmlcharrefreplace */ | |
8546 /* generate replacement (temporarily (mis)uses i) */ | |
8547 for (i = collstart; i < collend; ++i) { | |
8548 char buffer[2+29+1+1]; | |
8549 char *cp; | |
8550 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); | |
8551 if (charmaptranslate_makespace(&output, &osize, | |
8552 opos+strlen(buffer)+(size-collend))) | |
8553 goto onError; | |
8554 for (cp = buffer; *cp; ++cp) | |
8555 output[opos++] = *cp; | |
8556 } | |
8557 i = collend; | |
8558 break; | |
8559 default: | |
8560 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, | |
8561 reason, input, &exc, | |
8562 collstart, collend, &newpos); | |
8563 if (repunicode == NULL) | |
8564 goto onError; | |
8565 if (PyUnicode_READY(repunicode) < 0) { | |
8566 Py_DECREF(repunicode); | |
8567 goto onError; | |
8568 } | |
8569 /* generate replacement */ | |
8570 repsize = PyUnicode_GET_LENGTH(repunicode); | |
8571 if (charmaptranslate_makespace(&output, &osize, | |
8572 opos+repsize+(size-collend))) { | |
8573 Py_DECREF(repunicode); | |
8574 goto onError; | |
8575 } | |
8576 for (uni2 = 0; repsize-->0; ++uni2) | |
8577 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); | |
8578 i = newpos; | |
8579 Py_DECREF(repunicode); | |
8580 } | |
8581 } | |
8582 } | |
8583 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); | |
8584 if (!res) | |
8585 goto onError; | |
8586 PyMem_Free(output); | |
8587 Py_XDECREF(exc); | |
8588 Py_XDECREF(errorHandler); | |
8589 return res; | |
8590 | |
8591 onError: | |
8592 PyMem_Free(output); | |
8593 Py_XDECREF(exc); | |
8594 Py_XDECREF(errorHandler); | |
8595 return NULL; | |
8596 } | |
8597 | |
8598 /* Deprecated. Use PyUnicode_Translate instead. */ | |
8599 PyObject * | |
8600 PyUnicode_TranslateCharmap(const Py_UNICODE *p, | |
8601 Py_ssize_t size, | |
8602 PyObject *mapping, | |
8603 const char *errors) | |
8604 { | |
8605 PyObject *unicode = PyUnicode_FromUnicode(p, size); | |
8606 if (!unicode) | |
8607 return NULL; | |
8608 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); | |
8609 } | |
8610 | |
8611 PyObject * | |
8612 PyUnicode_Translate(PyObject *str, | |
8613 PyObject *mapping, | |
8614 const char *errors) | |
8615 { | |
8616 PyObject *result; | |
8617 | |
8618 str = PyUnicode_FromObject(str); | |
8619 if (str == NULL) | |
8620 goto onError; | |
8621 result = _PyUnicode_TranslateCharmap(str, mapping, errors); | |
8622 Py_DECREF(str); | |
8623 return result; | |
8624 | |
8625 onError: | |
8626 Py_XDECREF(str); | |
8627 return NULL; | |
8628 } | |
8629 | |
8630 static Py_UCS4 | |
8631 fix_decimal_and_space_to_ascii(PyObject *self) | |
8632 { | |
8633 /* No need to call PyUnicode_READY(self) because this function is only | |
8634 called as a callback from fixup() which does it already. */ | |
8635 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); | |
8636 const int kind = PyUnicode_KIND(self); | |
8637 void *data = PyUnicode_DATA(self); | |
8638 Py_UCS4 maxchar = 0, ch, fixed; | |
8639 Py_ssize_t i; | |
8640 | |
8641 for (i = 0; i < len; ++i) { | |
8642 ch = PyUnicode_READ(kind, data, i); | |
8643 fixed = 0; | |
8644 if (ch > 127) { | |
8645 if (Py_UNICODE_ISSPACE(ch)) | |
8646 fixed = ' '; | |
8647 else { | |
8648 const int decimal = Py_UNICODE_TODECIMAL(ch); | |
8649 if (decimal >= 0) | |
8650 fixed = '0' + decimal; | |
8651 } | |
8652 if (fixed != 0) { | |
8653 if (fixed > maxchar) | |
8654 maxchar = fixed; | |
8655 PyUnicode_WRITE(kind, data, i, fixed); | |
8656 } | |
8657 else if (ch > maxchar) | |
8658 maxchar = ch; | |
8659 } | |
8660 else if (ch > maxchar) | |
8661 maxchar = ch; | |
8662 } | |
8663 | |
8664 return maxchar; | |
8665 } | |
8666 | |
8667 PyObject * | |
8668 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) | |
8669 { | |
8670 if (!PyUnicode_Check(unicode)) { | |
8671 PyErr_BadInternalCall(); | |
8672 return NULL; | |
8673 } | |
8674 if (PyUnicode_READY(unicode) == -1) | |
8675 return NULL; | |
8676 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { | |
8677 /* If the string is already ASCII, just return the same string */ | |
8678 Py_INCREF(unicode); | |
8679 return unicode; | |
8680 } | |
8681 return fixup(unicode, fix_decimal_and_space_to_ascii); | |
8682 } | |
8683 | |
8684 PyObject * | |
8685 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, | |
8686 Py_ssize_t length) | |
8687 { | |
8688 PyObject *decimal; | |
8689 Py_ssize_t i; | |
8690 Py_UCS4 maxchar; | |
8691 enum PyUnicode_Kind kind; | |
8692 void *data; | |
8693 | |
8694 maxchar = 0; | |
8695 for (i = 0; i < length; i++) { | |
8696 Py_UNICODE ch = s[i]; | |
8697 if (ch > 127) { | |
8698 int decimal = Py_UNICODE_TODECIMAL(ch); | |
8699 if (decimal >= 0) | |
8700 ch = '0' + decimal; | |
8701 } | |
8702 maxchar = Py_MAX(maxchar, ch); | |
8703 } | |
8704 | |
8705 /* Copy to a new string */ | |
8706 decimal = PyUnicode_New(length, maxchar); | |
8707 if (decimal == NULL) | |
8708 return decimal; | |
8709 kind = PyUnicode_KIND(decimal); | |
8710 data = PyUnicode_DATA(decimal); | |
8711 /* Iterate over code points */ | |
8712 for (i = 0; i < length; i++) { | |
8713 Py_UNICODE ch = s[i]; | |
8714 if (ch > 127) { | |
8715 int decimal = Py_UNICODE_TODECIMAL(ch); | |
8716 if (decimal >= 0) | |
8717 ch = '0' + decimal; | |
8718 } | |
8719 PyUnicode_WRITE(kind, data, i, ch); | |
8720 } | |
8721 return unicode_result(decimal); | |
8722 } | |
8723 /* --- Decimal Encoder ---------------------------------------------------- */ | |
8724 | |
8725 int | |
8726 PyUnicode_EncodeDecimal(Py_UNICODE *s, | |
8727 Py_ssize_t length, | |
8728 char *output, | |
8729 const char *errors) | |
8730 { | |
8731 PyObject *unicode; | |
8732 Py_ssize_t i; | |
8733 enum PyUnicode_Kind kind; | |
8734 void *data; | |
8735 | |
8736 if (output == NULL) { | |
8737 PyErr_BadArgument(); | |
8738 return -1; | |
8739 } | |
8740 | |
8741 unicode = PyUnicode_FromUnicode(s, length); | |
8742 if (unicode == NULL) | |
8743 return -1; | |
8744 | |
8745 if (PyUnicode_READY(unicode) < 0) { | |
8746 Py_DECREF(unicode); | |
8747 return -1; | |
8748 } | |
8749 kind = PyUnicode_KIND(unicode); | |
8750 data = PyUnicode_DATA(unicode); | |
8751 | |
8752 for (i=0; i < length; ) { | |
8753 PyObject *exc; | |
8754 Py_UCS4 ch; | |
8755 int decimal; | |
8756 Py_ssize_t startpos; | |
8757 | |
8758 ch = PyUnicode_READ(kind, data, i); | |
8759 | |
8760 if (Py_UNICODE_ISSPACE(ch)) { | |
8761 *output++ = ' '; | |
8762 i++; | |
8763 continue; | |
8764 } | |
8765 decimal = Py_UNICODE_TODECIMAL(ch); | |
8766 if (decimal >= 0) { | |
8767 *output++ = '0' + decimal; | |
8768 i++; | |
8769 continue; | |
8770 } | |
8771 if (0 < ch && ch < 256) { | |
8772 *output++ = (char)ch; | |
8773 i++; | |
8774 continue; | |
8775 } | |
8776 | |
8777 startpos = i; | |
8778 exc = NULL; | |
8779 raise_encode_exception(&exc, "decimal", unicode, | |
8780 startpos, startpos+1, | |
8781 "invalid decimal Unicode string"); | |
8782 Py_XDECREF(exc); | |
8783 Py_DECREF(unicode); | |
8784 return -1; | |
8785 } | |
8786 /* 0-terminate the output string */ | |
8787 *output++ = '\0'; | |
8788 Py_DECREF(unicode); | |
8789 return 0; | |
8790 } | |
8791 | |
8792 /* --- Helpers ------------------------------------------------------------ */ | |
8793 | |
8794 static Py_ssize_t | |
8795 any_find_slice(int direction, PyObject* s1, PyObject* s2, | |
8796 Py_ssize_t start, | |
8797 Py_ssize_t end) | |
8798 { | |
8799 int kind1, kind2, kind; | |
8800 void *buf1, *buf2; | |
8801 Py_ssize_t len1, len2, result; | |
8802 | |
8803 kind1 = PyUnicode_KIND(s1); | |
8804 kind2 = PyUnicode_KIND(s2); | |
8805 kind = kind1 > kind2 ? kind1 : kind2; | |
8806 buf1 = PyUnicode_DATA(s1); | |
8807 buf2 = PyUnicode_DATA(s2); | |
8808 if (kind1 != kind) | |
8809 buf1 = _PyUnicode_AsKind(s1, kind); | |
8810 if (!buf1) | |
8811 return -2; | |
8812 if (kind2 != kind) | |
8813 buf2 = _PyUnicode_AsKind(s2, kind); | |
8814 if (!buf2) { | |
8815 if (kind1 != kind) PyMem_Free(buf1); | |
8816 return -2; | |
8817 } | |
8818 len1 = PyUnicode_GET_LENGTH(s1); | |
8819 len2 = PyUnicode_GET_LENGTH(s2); | |
8820 | |
8821 if (direction > 0) { | |
8822 switch(kind) { | |
8823 case PyUnicode_1BYTE_KIND: | |
8824 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) | |
8825 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); | |
8826 else | |
8827 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); | |
8828 break; | |
8829 case PyUnicode_2BYTE_KIND: | |
8830 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); | |
8831 break; | |
8832 case PyUnicode_4BYTE_KIND: | |
8833 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); | |
8834 break; | |
8835 default: | |
8836 assert(0); result = -2; | |
8837 } | |
8838 } | |
8839 else { | |
8840 switch(kind) { | |
8841 case PyUnicode_1BYTE_KIND: | |
8842 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) | |
8843 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); | |
8844 else | |
8845 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); | |
8846 break; | |
8847 case PyUnicode_2BYTE_KIND: | |
8848 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); | |
8849 break; | |
8850 case PyUnicode_4BYTE_KIND: | |
8851 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); | |
8852 break; | |
8853 default: | |
8854 assert(0); result = -2; | |
8855 } | |
8856 } | |
8857 | |
8858 if (kind1 != kind) | |
8859 PyMem_Free(buf1); | |
8860 if (kind2 != kind) | |
8861 PyMem_Free(buf2); | |
8862 | |
8863 return result; | |
8864 } | |
8865 | |
8866 Py_ssize_t | |
8867 _PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data, | |
8868 Py_ssize_t n_buffer, | |
8869 void *digits, Py_ssize_t n_digits, | |
8870 Py_ssize_t min_width, | |
8871 const char *grouping, | |
8872 const char *thousands_sep) | |
8873 { | |
8874 switch(kind) { | |
8875 case PyUnicode_1BYTE_KIND: | |
8876 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) | |
8877 return _PyUnicode_ascii_InsertThousandsGrouping( | |
8878 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, | |
8879 min_width, grouping, thousands_sep); | |
8880 else | |
8881 return _PyUnicode_ucs1_InsertThousandsGrouping( | |
8882 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, | |
8883 min_width, grouping, thousands_sep); | |
8884 case PyUnicode_2BYTE_KIND: | |
8885 return _PyUnicode_ucs2_InsertThousandsGrouping( | |
8886 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, | |
8887 min_width, grouping, thousands_sep); | |
8888 case PyUnicode_4BYTE_KIND: | |
8889 return _PyUnicode_ucs4_InsertThousandsGrouping( | |
8890 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits, | |
8891 min_width, grouping, thousands_sep); | |
8892 } | |
8893 assert(0); | |
8894 return -1; | |
8895 } | |
8896 | |
8897 | |
/* helper macro to fixup start/end slice values

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if still negative.  `start` is not clamped to
   `len`.  `start` and `end` must be modifiable lvalues.

   The arguments are parenthesized so expressions can be passed for `len`,
   and the body is a single statement so the macro can be used in an
   unbraced if/else.  Invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8912 | |
8913 Py_ssize_t | |
8914 PyUnicode_Count(PyObject *str, | |
8915 PyObject *substr, | |
8916 Py_ssize_t start, | |
8917 Py_ssize_t end) | |
8918 { | |
8919 Py_ssize_t result; | |
8920 PyObject* str_obj; | |
8921 PyObject* sub_obj; | |
8922 int kind1, kind2, kind; | |
8923 void *buf1 = NULL, *buf2 = NULL; | |
8924 Py_ssize_t len1, len2; | |
8925 | |
8926 str_obj = PyUnicode_FromObject(str); | |
8927 if (!str_obj || PyUnicode_READY(str_obj) == -1) | |
8928 return -1; | |
8929 sub_obj = PyUnicode_FromObject(substr); | |
8930 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) { | |
8931 Py_DECREF(str_obj); | |
8932 return -1; | |
8933 } | |
8934 | |
8935 kind1 = PyUnicode_KIND(str_obj); | |
8936 kind2 = PyUnicode_KIND(sub_obj); | |
8937 kind = kind1 > kind2 ? kind1 : kind2; | |
8938 buf1 = PyUnicode_DATA(str_obj); | |
8939 if (kind1 != kind) | |
8940 buf1 = _PyUnicode_AsKind(str_obj, kind); | |
8941 if (!buf1) | |
8942 goto onError; | |
8943 buf2 = PyUnicode_DATA(sub_obj); | |
8944 if (kind2 != kind) | |
8945 buf2 = _PyUnicode_AsKind(sub_obj, kind); | |
8946 if (!buf2) | |
8947 goto onError; | |
8948 len1 = PyUnicode_GET_LENGTH(str_obj); | |
8949 len2 = PyUnicode_GET_LENGTH(sub_obj); | |
8950 | |
8951 ADJUST_INDICES(start, end, len1); | |
8952 switch(kind) { | |
8953 case PyUnicode_1BYTE_KIND: | |
8954 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) | |
8955 result = asciilib_count( | |
8956 ((Py_UCS1*)buf1) + start, end - start, | |
8957 buf2, len2, PY_SSIZE_T_MAX | |
8958 ); | |
8959 else | |
8960 result = ucs1lib_count( | |
8961 ((Py_UCS1*)buf1) + start, end - start, | |
8962 buf2, len2, PY_SSIZE_T_MAX | |
8963 ); | |
8964 break; | |
8965 case PyUnicode_2BYTE_KIND: | |
8966 result = ucs2lib_count( | |
8967 ((Py_UCS2*)buf1) + start, end - start, | |
8968 buf2, len2, PY_SSIZE_T_MAX | |
8969 ); | |
8970 break; | |
8971 case PyUnicode_4BYTE_KIND: | |
8972 result = ucs4lib_count( | |
8973 ((Py_UCS4*)buf1) + start, end - start, | |
8974 buf2, len2, PY_SSIZE_T_MAX | |
8975 ); | |
8976 break; | |
8977 default: | |
8978 assert(0); result = 0; | |
8979 } | |
8980 | |
8981 Py_DECREF(sub_obj); | |
8982 Py_DECREF(str_obj); | |
8983 | |
8984 if (kind1 != kind) | |
8985 PyMem_Free(buf1); | |
8986 if (kind2 != kind) | |
8987 PyMem_Free(buf2); | |
8988 | |
8989 return result; | |
8990 onError: | |
8991 Py_DECREF(sub_obj); | |
8992 Py_DECREF(str_obj); | |
8993 if (kind1 != kind && buf1) | |
8994 PyMem_Free(buf1); | |
8995 if (kind2 != kind && buf2) | |
8996 PyMem_Free(buf2); | |
8997 return -1; | |
8998 } | |
8999 | |
9000 Py_ssize_t | |
9001 PyUnicode_Find(PyObject *str, | |
9002 PyObject *sub, | |
9003 Py_ssize_t start, | |
9004 Py_ssize_t end, | |
9005 int direction) | |
9006 { | |
9007 Py_ssize_t result; | |
9008 | |
9009 str = PyUnicode_FromObject(str); | |
9010 if (!str || PyUnicode_READY(str) == -1) | |
9011 return -2; | |
9012 sub = PyUnicode_FromObject(sub); | |
9013 if (!sub || PyUnicode_READY(sub) == -1) { | |
9014 Py_DECREF(str); | |
9015 return -2; | |
9016 } | |
9017 | |
9018 result = any_find_slice(direction, | |
9019 str, sub, start, end | |
9020 ); | |
9021 | |
9022 Py_DECREF(str); | |
9023 Py_DECREF(sub); | |
9024 | |
9025 return result; | |
9026 } | |
9027 | |
9028 Py_ssize_t | |
9029 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, | |
9030 Py_ssize_t start, Py_ssize_t end, | |
9031 int direction) | |
9032 { | |
9033 int kind; | |
9034 Py_ssize_t result; | |
9035 if (PyUnicode_READY(str) == -1) | |
9036 return -2; | |
9037 if (start < 0 || end < 0) { | |
9038 PyErr_SetString(PyExc_IndexError, "string index out of range"); | |
9039 return -2; | |
9040 } | |
9041 if (end > PyUnicode_GET_LENGTH(str)) | |
9042 end = PyUnicode_GET_LENGTH(str); | |
9043 kind = PyUnicode_KIND(str); | |
9044 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, | |
9045 kind, end-start, ch, direction); | |
9046 if (result == -1) | |
9047 return -1; | |
9048 else | |
9049 return start + result; | |
9050 } | |
9051 | |
9052 static int | |
9053 tailmatch(PyObject *self, | |
9054 PyObject *substring, | |
9055 Py_ssize_t start, | |
9056 Py_ssize_t end, | |
9057 int direction) | |
9058 { | |
9059 int kind_self; | |
9060 int kind_sub; | |
9061 void *data_self; | |
9062 void *data_sub; | |
9063 Py_ssize_t offset; | |
9064 Py_ssize_t i; | |
9065 Py_ssize_t end_sub; | |
9066 | |
9067 if (PyUnicode_READY(self) == -1 || | |
9068 PyUnicode_READY(substring) == -1) | |
9069 return 0; | |
9070 | |
9071 if (PyUnicode_GET_LENGTH(substring) == 0) | |
9072 return 1; | |
9073 | |
9074 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); | |
9075 end -= PyUnicode_GET_LENGTH(substring); | |
9076 if (end < start) | |
9077 return 0; | |
9078 | |
9079 kind_self = PyUnicode_KIND(self); | |
9080 data_self = PyUnicode_DATA(self); | |
9081 kind_sub = PyUnicode_KIND(substring); | |
9082 data_sub = PyUnicode_DATA(substring); | |
9083 end_sub = PyUnicode_GET_LENGTH(substring) - 1; | |
9084 | |
9085 if (direction > 0) | |
9086 offset = end; | |
9087 else | |
9088 offset = start; | |
9089 | |
9090 if (PyUnicode_READ(kind_self, data_self, offset) == | |
9091 PyUnicode_READ(kind_sub, data_sub, 0) && | |
9092 PyUnicode_READ(kind_self, data_self, offset + end_sub) == | |
9093 PyUnicode_READ(kind_sub, data_sub, end_sub)) { | |
9094 /* If both are of the same kind, memcmp is sufficient */ | |
9095 if (kind_self == kind_sub) { | |
9096 return ! memcmp((char *)data_self + | |
9097 (offset * PyUnicode_KIND(substring)), | |
9098 data_sub, | |
9099 PyUnicode_GET_LENGTH(substring) * | |
9100 PyUnicode_KIND(substring)); | |
9101 } | |
9102 /* otherwise we have to compare each character by first accesing it */ | |
9103 else { | |
9104 /* We do not need to compare 0 and len(substring)-1 because | |
9105 the if statement above ensured already that they are equal | |
9106 when we end up here. */ | |
9107 // TODO: honor direction and do a forward or backwards search | |
9108 for (i = 1; i < end_sub; ++i) { | |
9109 if (PyUnicode_READ(kind_self, data_self, offset + i) != | |
9110 PyUnicode_READ(kind_sub, data_sub, i)) | |
9111 return 0; | |
9112 } | |
9113 return 1; | |
9114 } | |
9115 } | |
9116 | |
9117 return 0; | |
9118 } | |
9119 | |
9120 Py_ssize_t | |
9121 PyUnicode_Tailmatch(PyObject *str, | |
9122 PyObject *substr, | |
9123 Py_ssize_t start, | |
9124 Py_ssize_t end, | |
9125 int direction) | |
9126 { | |
9127 Py_ssize_t result; | |
9128 | |
9129 str = PyUnicode_FromObject(str); | |
9130 if (str == NULL) | |
9131 return -1; | |
9132 substr = PyUnicode_FromObject(substr); | |
9133 if (substr == NULL) { | |
9134 Py_DECREF(str); | |
9135 return -1; | |
9136 } | |
9137 | |
9138 result = tailmatch(str, substr, | |
9139 start, end, direction); | |
9140 Py_DECREF(str); | |
9141 Py_DECREF(substr); | |
9142 return result; | |
9143 } | |
9144 | |
9145 /* Apply fixfct filter to the Unicode object self and return a | |
9146 reference to the modified object */ | |
9147 | |
9148 static PyObject * | |
9149 fixup(PyObject *self, | |
9150 Py_UCS4 (*fixfct)(PyObject *s)) | |
9151 { | |
9152 PyObject *u; | |
9153 Py_UCS4 maxchar_old, maxchar_new = 0; | |
9154 | |
9155 u = PyUnicode_Copy(self); | |
9156 if (u == NULL) | |
9157 return NULL; | |
9158 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); | |
9159 | |
9160 /* fix functions return the new maximum character in a string, | |
9161 if the kind of the resulting unicode object does not change, | |
9162 everything is fine. Otherwise we need to change the string kind | |
9163 and re-run the fix function. */ | |
9164 maxchar_new = fixfct(u); | |
9165 if (maxchar_new == 0) | |
9166 /* do nothing, keep maxchar_new at 0 which means no changes. */; | |
9167 else if (maxchar_new <= 127) | |
9168 maxchar_new = 127; | |
9169 else if (maxchar_new <= 255) | |
9170 maxchar_new = 255; | |
9171 else if (maxchar_new <= 65535) | |
9172 maxchar_new = 65535; | |
9173 else | |
9174 maxchar_new = MAX_UNICODE; | |
9175 | |
9176 if (!maxchar_new && PyUnicode_CheckExact(self)) { | |
9177 /* fixfct should return TRUE if it modified the buffer. If | |
9178 FALSE, return a reference to the original buffer instead | |
9179 (to save space, not time) */ | |
9180 Py_INCREF(self); | |
9181 Py_DECREF(u); | |
9182 return self; | |
9183 } | |
9184 else if (maxchar_new == maxchar_old) { | |
9185 return u; | |
9186 } | |
9187 else { | |
9188 /* In case the maximum character changed, we need to | |
9189 convert the string to the new category. */ | |
9190 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); | |
9191 if (v == NULL) { | |
9192 Py_DECREF(u); | |
9193 return NULL; | |
9194 } | |
9195 if (maxchar_new > maxchar_old) { | |
9196 /* If the maxchar increased so that the kind changed, not all | |
9197 characters are representable anymore and we need to fix the | |
9198 string again. This only happens in very few cases. */ | |
9199 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self)); | |
9200 maxchar_old = fixfct(v); | |
9201 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); | |
9202 } | |
9203 else { | |
9204 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self)); | |
9205 } | |
9206 | |
9207 Py_DECREF(u); | |
9208 assert(_PyUnicode_CheckConsistency(v, 1)); | |
9209 return v; | |
9210 } | |
9211 } | |
9212 | |
9213 static Py_UCS4 | |
9214 fixupper(PyObject *self) | |
9215 { | |
9216 /* No need to call PyUnicode_READY(self) because this function is only | |
9217 called as a callback from fixup() which does it already. */ | |
9218 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); | |
9219 const int kind = PyUnicode_KIND(self); | |
9220 void *data = PyUnicode_DATA(self); | |
9221 int touched = 0; | |
9222 Py_UCS4 maxchar = 0; | |
9223 Py_ssize_t i; | |
9224 | |
9225 for (i = 0; i < len; ++i) { | |
9226 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
9227 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); | |
9228 if (up != ch) { | |
9229 if (up > maxchar) | |
9230 maxchar = up; | |
9231 PyUnicode_WRITE(kind, data, i, up); | |
9232 touched = 1; | |
9233 } | |
9234 else if (ch > maxchar) | |
9235 maxchar = ch; | |
9236 } | |
9237 | |
9238 if (touched) | |
9239 return maxchar; | |
9240 else | |
9241 return 0; | |
9242 } | |
9243 | |
9244 static Py_UCS4 | |
9245 fixlower(PyObject *self) | |
9246 { | |
9247 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ | |
9248 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); | |
9249 const int kind = PyUnicode_KIND(self); | |
9250 void *data = PyUnicode_DATA(self); | |
9251 int touched = 0; | |
9252 Py_UCS4 maxchar = 0; | |
9253 Py_ssize_t i; | |
9254 | |
9255 for(i = 0; i < len; ++i) { | |
9256 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
9257 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); | |
9258 if (lo != ch) { | |
9259 if (lo > maxchar) | |
9260 maxchar = lo; | |
9261 PyUnicode_WRITE(kind, data, i, lo); | |
9262 touched = 1; | |
9263 } | |
9264 else if (ch > maxchar) | |
9265 maxchar = ch; | |
9266 } | |
9267 | |
9268 if (touched) | |
9269 return maxchar; | |
9270 else | |
9271 return 0; | |
9272 } | |
9273 | |
9274 static Py_UCS4 | |
9275 fixswapcase(PyObject *self) | |
9276 { | |
9277 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ | |
9278 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); | |
9279 const int kind = PyUnicode_KIND(self); | |
9280 void *data = PyUnicode_DATA(self); | |
9281 int touched = 0; | |
9282 Py_UCS4 maxchar = 0; | |
9283 Py_ssize_t i; | |
9284 | |
9285 for(i = 0; i < len; ++i) { | |
9286 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
9287 Py_UCS4 nu = 0; | |
9288 | |
9289 if (Py_UNICODE_ISUPPER(ch)) | |
9290 nu = Py_UNICODE_TOLOWER(ch); | |
9291 else if (Py_UNICODE_ISLOWER(ch)) | |
9292 nu = Py_UNICODE_TOUPPER(ch); | |
9293 | |
9294 if (nu != 0) { | |
9295 if (nu > maxchar) | |
9296 maxchar = nu; | |
9297 PyUnicode_WRITE(kind, data, i, nu); | |
9298 touched = 1; | |
9299 } | |
9300 else if (ch > maxchar) | |
9301 maxchar = ch; | |
9302 } | |
9303 | |
9304 if (touched) | |
9305 return maxchar; | |
9306 else | |
9307 return 0; | |
9308 } | |
9309 | |
9310 static Py_UCS4 | |
9311 fixcapitalize(PyObject *self) | |
9312 { | |
9313 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ | |
9314 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); | |
9315 const int kind = PyUnicode_KIND(self); | |
9316 void *data = PyUnicode_DATA(self); | |
9317 int touched = 0; | |
9318 Py_UCS4 maxchar = 0; | |
9319 Py_ssize_t i = 0; | |
9320 Py_UCS4 ch; | |
9321 | |
9322 if (len == 0) | |
9323 return 0; | |
9324 | |
9325 ch = PyUnicode_READ(kind, data, i); | |
9326 if (!Py_UNICODE_ISUPPER(ch)) { | |
9327 maxchar = Py_UNICODE_TOUPPER(ch); | |
9328 PyUnicode_WRITE(kind, data, i, maxchar); | |
9329 touched = 1; | |
9330 } | |
9331 ++i; | |
9332 for(; i < len; ++i) { | |
9333 ch = PyUnicode_READ(kind, data, i); | |
9334 if (!Py_UNICODE_ISLOWER(ch)) { | |
9335 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); | |
9336 if (lo > maxchar) | |
9337 maxchar = lo; | |
9338 PyUnicode_WRITE(kind, data, i, lo); | |
9339 touched = 1; | |
9340 } | |
9341 else if (ch > maxchar) | |
9342 maxchar = ch; | |
9343 } | |
9344 | |
9345 if (touched) | |
9346 return maxchar; | |
9347 else | |
9348 return 0; | |
9349 } | |
9350 | |
9351 static Py_UCS4 | |
9352 fixtitle(PyObject *self) | |
9353 { | |
9354 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ | |
9355 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); | |
9356 const int kind = PyUnicode_KIND(self); | |
9357 void *data = PyUnicode_DATA(self); | |
9358 Py_UCS4 maxchar = 0; | |
9359 Py_ssize_t i = 0; | |
9360 int previous_is_cased; | |
9361 | |
9362 /* Shortcut for single character strings */ | |
9363 if (len == 1) { | |
9364 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
9365 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); | |
9366 if (ti != ch) { | |
9367 PyUnicode_WRITE(kind, data, i, ti); | |
9368 return ti; | |
9369 } | |
9370 else | |
9371 return 0; | |
9372 } | |
9373 previous_is_cased = 0; | |
9374 for(; i < len; ++i) { | |
9375 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
9376 Py_UCS4 nu; | |
9377 | |
9378 if (previous_is_cased) | |
9379 nu = Py_UNICODE_TOLOWER(ch); | |
9380 else | |
9381 nu = Py_UNICODE_TOTITLE(ch); | |
9382 | |
9383 if (nu > maxchar) | |
9384 maxchar = nu; | |
9385 PyUnicode_WRITE(kind, data, i, nu); | |
9386 | |
9387 if (Py_UNICODE_ISLOWER(ch) || | |
9388 Py_UNICODE_ISUPPER(ch) || | |
9389 Py_UNICODE_ISTITLE(ch)) | |
9390 previous_is_cased = 1; | |
9391 else | |
9392 previous_is_cased = 0; | |
9393 } | |
9394 return maxchar; | |
9395 } | |
9396 | |
9397 PyObject * | |
9398 PyUnicode_Join(PyObject *separator, PyObject *seq) | |
9399 { | |
9400 PyObject *sep = NULL; | |
9401 Py_ssize_t seplen; | |
9402 PyObject *res = NULL; /* the result */ | |
9403 PyObject *fseq; /* PySequence_Fast(seq) */ | |
9404 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ | |
9405 PyObject **items; | |
9406 PyObject *item; | |
9407 Py_ssize_t sz, i, res_offset; | |
9408 Py_UCS4 maxchar; | |
9409 Py_UCS4 item_maxchar; | |
9410 int use_memcpy; | |
9411 unsigned char *res_data = NULL, *sep_data = NULL; | |
9412 PyObject *last_obj; | |
9413 unsigned int kind = 0; | |
9414 | |
9415 fseq = PySequence_Fast(seq, ""); | |
9416 if (fseq == NULL) { | |
9417 return NULL; | |
9418 } | |
9419 | |
9420 /* NOTE: the following code can't call back into Python code, | |
9421 * so we are sure that fseq won't be mutated. | |
9422 */ | |
9423 | |
9424 seqlen = PySequence_Fast_GET_SIZE(fseq); | |
9425 /* If empty sequence, return u"". */ | |
9426 if (seqlen == 0) { | |
9427 Py_DECREF(fseq); | |
9428 Py_INCREF(unicode_empty); | |
9429 res = unicode_empty; | |
9430 return res; | |
9431 } | |
9432 | |
9433 /* If singleton sequence with an exact Unicode, return that. */ | |
9434 last_obj = NULL; | |
9435 items = PySequence_Fast_ITEMS(fseq); | |
9436 if (seqlen == 1) { | |
9437 if (PyUnicode_CheckExact(items[0])) { | |
9438 res = items[0]; | |
9439 Py_INCREF(res); | |
9440 Py_DECREF(fseq); | |
9441 return res; | |
9442 } | |
9443 seplen = 0; | |
9444 maxchar = 0; | |
9445 } | |
9446 else { | |
9447 /* Set up sep and seplen */ | |
9448 if (separator == NULL) { | |
9449 /* fall back to a blank space separator */ | |
9450 sep = PyUnicode_FromOrdinal(' '); | |
9451 if (!sep) | |
9452 goto onError; | |
9453 seplen = 1; | |
9454 maxchar = 32; | |
9455 } | |
9456 else { | |
9457 if (!PyUnicode_Check(separator)) { | |
9458 PyErr_Format(PyExc_TypeError, | |
9459 "separator: expected str instance," | |
9460 " %.80s found", | |
9461 Py_TYPE(separator)->tp_name); | |
9462 goto onError; | |
9463 } | |
9464 if (PyUnicode_READY(separator)) | |
9465 goto onError; | |
9466 sep = separator; | |
9467 seplen = PyUnicode_GET_LENGTH(separator); | |
9468 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); | |
9469 /* inc refcount to keep this code path symmetric with the | |
9470 above case of a blank separator */ | |
9471 Py_INCREF(sep); | |
9472 } | |
9473 last_obj = sep; | |
9474 } | |
9475 | |
9476 /* There are at least two things to join, or else we have a subclass | |
9477 * of str in the sequence. | |
9478 * Do a pre-pass to figure out the total amount of space we'll | |
9479 * need (sz), and see whether all argument are strings. | |
9480 */ | |
9481 sz = 0; | |
9482 #ifdef Py_DEBUG | |
9483 use_memcpy = 0; | |
9484 #else | |
9485 use_memcpy = 1; | |
9486 #endif | |
9487 for (i = 0; i < seqlen; i++) { | |
9488 const Py_ssize_t old_sz = sz; | |
9489 item = items[i]; | |
9490 if (!PyUnicode_Check(item)) { | |
9491 PyErr_Format(PyExc_TypeError, | |
9492 "sequence item %zd: expected str instance," | |
9493 " %.80s found", | |
9494 i, Py_TYPE(item)->tp_name); | |
9495 goto onError; | |
9496 } | |
9497 if (PyUnicode_READY(item) == -1) | |
9498 goto onError; | |
9499 sz += PyUnicode_GET_LENGTH(item); | |
9500 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); | |
9501 maxchar = Py_MAX(maxchar, item_maxchar); | |
9502 if (i != 0) | |
9503 sz += seplen; | |
9504 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { | |
9505 PyErr_SetString(PyExc_OverflowError, | |
9506 "join() result is too long for a Python string"); | |
9507 goto onError; | |
9508 } | |
9509 if (use_memcpy && last_obj != NULL) { | |
9510 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) | |
9511 use_memcpy = 0; | |
9512 } | |
9513 last_obj = item; | |
9514 } | |
9515 | |
9516 res = PyUnicode_New(sz, maxchar); | |
9517 if (res == NULL) | |
9518 goto onError; | |
9519 | |
9520 /* Catenate everything. */ | |
9521 #ifdef Py_DEBUG | |
9522 use_memcpy = 0; | |
9523 #else | |
9524 if (use_memcpy) { | |
9525 res_data = PyUnicode_1BYTE_DATA(res); | |
9526 kind = PyUnicode_KIND(res); | |
9527 if (seplen != 0) | |
9528 sep_data = PyUnicode_1BYTE_DATA(sep); | |
9529 } | |
9530 #endif | |
9531 for (i = 0, res_offset = 0; i < seqlen; ++i) { | |
9532 Py_ssize_t itemlen; | |
9533 item = items[i]; | |
9534 /* Copy item, and maybe the separator. */ | |
9535 if (i && seplen != 0) { | |
9536 if (use_memcpy) { | |
9537 Py_MEMCPY(res_data, | |
9538 sep_data, | |
9539 kind * seplen); | |
9540 res_data += kind * seplen; | |
9541 } | |
9542 else { | |
9543 copy_characters(res, res_offset, sep, 0, seplen); | |
9544 res_offset += seplen; | |
9545 } | |
9546 } | |
9547 itemlen = PyUnicode_GET_LENGTH(item); | |
9548 if (itemlen != 0) { | |
9549 if (use_memcpy) { | |
9550 Py_MEMCPY(res_data, | |
9551 PyUnicode_DATA(item), | |
9552 kind * itemlen); | |
9553 res_data += kind * itemlen; | |
9554 } | |
9555 else { | |
9556 copy_characters(res, res_offset, item, 0, itemlen); | |
9557 res_offset += itemlen; | |
9558 } | |
9559 } | |
9560 } | |
9561 if (use_memcpy) | |
9562 assert(res_data == PyUnicode_1BYTE_DATA(res) | |
9563 + kind * PyUnicode_GET_LENGTH(res)); | |
9564 else | |
9565 assert(res_offset == PyUnicode_GET_LENGTH(res)); | |
9566 | |
9567 Py_DECREF(fseq); | |
9568 Py_XDECREF(sep); | |
9569 assert(_PyUnicode_CheckConsistency(res, 1)); | |
9570 return res; | |
9571 | |
9572 onError: | |
9573 Py_DECREF(fseq); | |
9574 Py_XDECREF(sep); | |
9575 Py_XDECREF(res); | |
9576 return NULL; | |
9577 } | |
9578 | |
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   character width: PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, or
   anything else for Py_UCS4.  PyUnicode_WCHAR_KIND is rejected by an
   assertion.

   Every parameter is parenthesised in the expansion so that compound
   expressions can be passed safely.  Arguments may still be evaluated
   more than once, so they must not have side effects. */
#define FILL(kind, data, value, start, length)                          \
    do {                                                                \
        Py_ssize_t i_ = 0;                                              \
        assert((kind) != PyUnicode_WCHAR_KIND);                         \
        switch ((kind)) {                                               \
        case PyUnicode_1BYTE_KIND: {                                    \
            unsigned char * to_ = (unsigned char *)((data)) + (start);  \
            memset(to_, (unsigned char)(value), (length));              \
            break;                                                      \
        }                                                               \
        case PyUnicode_2BYTE_KIND: {                                    \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start);              \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value);          \
            break;                                                      \
        }                                                               \
        default: {                                                      \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start);              \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value);          \
            break;                                                      \
        }                                                               \
        }                                                               \
    } while (0)
9601 | |
9602 static PyObject * | |
9603 pad(PyObject *self, | |
9604 Py_ssize_t left, | |
9605 Py_ssize_t right, | |
9606 Py_UCS4 fill) | |
9607 { | |
9608 PyObject *u; | |
9609 Py_UCS4 maxchar; | |
9610 int kind; | |
9611 void *data; | |
9612 | |
9613 if (left < 0) | |
9614 left = 0; | |
9615 if (right < 0) | |
9616 right = 0; | |
9617 | |
9618 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { | |
9619 Py_INCREF(self); | |
9620 return self; | |
9621 } | |
9622 | |
9623 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || | |
9624 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { | |
9625 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); | |
9626 return NULL; | |
9627 } | |
9628 maxchar = PyUnicode_MAX_CHAR_VALUE(self); | |
9629 if (fill > maxchar) | |
9630 maxchar = fill; | |
9631 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); | |
9632 if (!u) | |
9633 return NULL; | |
9634 | |
9635 kind = PyUnicode_KIND(u); | |
9636 data = PyUnicode_DATA(u); | |
9637 if (left) | |
9638 FILL(kind, data, fill, 0, left); | |
9639 if (right) | |
9640 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); | |
9641 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self)); | |
9642 assert(_PyUnicode_CheckConsistency(u, 1)); | |
9643 return u; | |
9644 } | |
9645 #undef FILL | |
9646 | |
9647 PyObject * | |
9648 PyUnicode_Splitlines(PyObject *string, int keepends) | |
9649 { | |
9650 PyObject *list; | |
9651 | |
9652 string = PyUnicode_FromObject(string); | |
9653 if (string == NULL || PyUnicode_READY(string) == -1) | |
9654 return NULL; | |
9655 | |
9656 switch(PyUnicode_KIND(string)) { | |
9657 case PyUnicode_1BYTE_KIND: | |
9658 if (PyUnicode_IS_ASCII(string)) | |
9659 list = asciilib_splitlines( | |
9660 string, PyUnicode_1BYTE_DATA(string), | |
9661 PyUnicode_GET_LENGTH(string), keepends); | |
9662 else | |
9663 list = ucs1lib_splitlines( | |
9664 string, PyUnicode_1BYTE_DATA(string), | |
9665 PyUnicode_GET_LENGTH(string), keepends); | |
9666 break; | |
9667 case PyUnicode_2BYTE_KIND: | |
9668 list = ucs2lib_splitlines( | |
9669 string, PyUnicode_2BYTE_DATA(string), | |
9670 PyUnicode_GET_LENGTH(string), keepends); | |
9671 break; | |
9672 case PyUnicode_4BYTE_KIND: | |
9673 list = ucs4lib_splitlines( | |
9674 string, PyUnicode_4BYTE_DATA(string), | |
9675 PyUnicode_GET_LENGTH(string), keepends); | |
9676 break; | |
9677 default: | |
9678 assert(0); | |
9679 list = 0; | |
9680 } | |
9681 Py_DECREF(string); | |
9682 return list; | |
9683 } | |
9684 | |
9685 static PyObject * | |
9686 split(PyObject *self, | |
9687 PyObject *substring, | |
9688 Py_ssize_t maxcount) | |
9689 { | |
9690 int kind1, kind2, kind; | |
9691 void *buf1, *buf2; | |
9692 Py_ssize_t len1, len2; | |
9693 PyObject* out; | |
9694 | |
9695 if (maxcount < 0) | |
9696 maxcount = PY_SSIZE_T_MAX; | |
9697 | |
9698 if (PyUnicode_READY(self) == -1) | |
9699 return NULL; | |
9700 | |
9701 if (substring == NULL) | |
9702 switch(PyUnicode_KIND(self)) { | |
9703 case PyUnicode_1BYTE_KIND: | |
9704 if (PyUnicode_IS_ASCII(self)) | |
9705 return asciilib_split_whitespace( | |
9706 self, PyUnicode_1BYTE_DATA(self), | |
9707 PyUnicode_GET_LENGTH(self), maxcount | |
9708 ); | |
9709 else | |
9710 return ucs1lib_split_whitespace( | |
9711 self, PyUnicode_1BYTE_DATA(self), | |
9712 PyUnicode_GET_LENGTH(self), maxcount | |
9713 ); | |
9714 case PyUnicode_2BYTE_KIND: | |
9715 return ucs2lib_split_whitespace( | |
9716 self, PyUnicode_2BYTE_DATA(self), | |
9717 PyUnicode_GET_LENGTH(self), maxcount | |
9718 ); | |
9719 case PyUnicode_4BYTE_KIND: | |
9720 return ucs4lib_split_whitespace( | |
9721 self, PyUnicode_4BYTE_DATA(self), | |
9722 PyUnicode_GET_LENGTH(self), maxcount | |
9723 ); | |
9724 default: | |
9725 assert(0); | |
9726 return NULL; | |
9727 } | |
9728 | |
9729 if (PyUnicode_READY(substring) == -1) | |
9730 return NULL; | |
9731 | |
9732 kind1 = PyUnicode_KIND(self); | |
9733 kind2 = PyUnicode_KIND(substring); | |
9734 kind = kind1 > kind2 ? kind1 : kind2; | |
9735 buf1 = PyUnicode_DATA(self); | |
9736 buf2 = PyUnicode_DATA(substring); | |
9737 if (kind1 != kind) | |
9738 buf1 = _PyUnicode_AsKind(self, kind); | |
9739 if (!buf1) | |
9740 return NULL; | |
9741 if (kind2 != kind) | |
9742 buf2 = _PyUnicode_AsKind(substring, kind); | |
9743 if (!buf2) { | |
9744 if (kind1 != kind) PyMem_Free(buf1); | |
9745 return NULL; | |
9746 } | |
9747 len1 = PyUnicode_GET_LENGTH(self); | |
9748 len2 = PyUnicode_GET_LENGTH(substring); | |
9749 | |
9750 switch(kind) { | |
9751 case PyUnicode_1BYTE_KIND: | |
9752 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) | |
9753 out = asciilib_split( | |
9754 self, buf1, len1, buf2, len2, maxcount); | |
9755 else | |
9756 out = ucs1lib_split( | |
9757 self, buf1, len1, buf2, len2, maxcount); | |
9758 break; | |
9759 case PyUnicode_2BYTE_KIND: | |
9760 out = ucs2lib_split( | |
9761 self, buf1, len1, buf2, len2, maxcount); | |
9762 break; | |
9763 case PyUnicode_4BYTE_KIND: | |
9764 out = ucs4lib_split( | |
9765 self, buf1, len1, buf2, len2, maxcount); | |
9766 break; | |
9767 default: | |
9768 out = NULL; | |
9769 } | |
9770 if (kind1 != kind) | |
9771 PyMem_Free(buf1); | |
9772 if (kind2 != kind) | |
9773 PyMem_Free(buf2); | |
9774 return out; | |
9775 } | |
9776 | |
9777 static PyObject * | |
9778 rsplit(PyObject *self, | |
9779 PyObject *substring, | |
9780 Py_ssize_t maxcount) | |
9781 { | |
9782 int kind1, kind2, kind; | |
9783 void *buf1, *buf2; | |
9784 Py_ssize_t len1, len2; | |
9785 PyObject* out; | |
9786 | |
9787 if (maxcount < 0) | |
9788 maxcount = PY_SSIZE_T_MAX; | |
9789 | |
9790 if (PyUnicode_READY(self) == -1) | |
9791 return NULL; | |
9792 | |
9793 if (substring == NULL) | |
9794 switch(PyUnicode_KIND(self)) { | |
9795 case PyUnicode_1BYTE_KIND: | |
9796 if (PyUnicode_IS_ASCII(self)) | |
9797 return asciilib_rsplit_whitespace( | |
9798 self, PyUnicode_1BYTE_DATA(self), | |
9799 PyUnicode_GET_LENGTH(self), maxcount | |
9800 ); | |
9801 else | |
9802 return ucs1lib_rsplit_whitespace( | |
9803 self, PyUnicode_1BYTE_DATA(self), | |
9804 PyUnicode_GET_LENGTH(self), maxcount | |
9805 ); | |
9806 case PyUnicode_2BYTE_KIND: | |
9807 return ucs2lib_rsplit_whitespace( | |
9808 self, PyUnicode_2BYTE_DATA(self), | |
9809 PyUnicode_GET_LENGTH(self), maxcount | |
9810 ); | |
9811 case PyUnicode_4BYTE_KIND: | |
9812 return ucs4lib_rsplit_whitespace( | |
9813 self, PyUnicode_4BYTE_DATA(self), | |
9814 PyUnicode_GET_LENGTH(self), maxcount | |
9815 ); | |
9816 default: | |
9817 assert(0); | |
9818 return NULL; | |
9819 } | |
9820 | |
9821 if (PyUnicode_READY(substring) == -1) | |
9822 return NULL; | |
9823 | |
9824 kind1 = PyUnicode_KIND(self); | |
9825 kind2 = PyUnicode_KIND(substring); | |
9826 kind = kind1 > kind2 ? kind1 : kind2; | |
9827 buf1 = PyUnicode_DATA(self); | |
9828 buf2 = PyUnicode_DATA(substring); | |
9829 if (kind1 != kind) | |
9830 buf1 = _PyUnicode_AsKind(self, kind); | |
9831 if (!buf1) | |
9832 return NULL; | |
9833 if (kind2 != kind) | |
9834 buf2 = _PyUnicode_AsKind(substring, kind); | |
9835 if (!buf2) { | |
9836 if (kind1 != kind) PyMem_Free(buf1); | |
9837 return NULL; | |
9838 } | |
9839 len1 = PyUnicode_GET_LENGTH(self); | |
9840 len2 = PyUnicode_GET_LENGTH(substring); | |
9841 | |
9842 switch(kind) { | |
9843 case PyUnicode_1BYTE_KIND: | |
9844 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) | |
9845 out = asciilib_rsplit( | |
9846 self, buf1, len1, buf2, len2, maxcount); | |
9847 else | |
9848 out = ucs1lib_rsplit( | |
9849 self, buf1, len1, buf2, len2, maxcount); | |
9850 break; | |
9851 case PyUnicode_2BYTE_KIND: | |
9852 out = ucs2lib_rsplit( | |
9853 self, buf1, len1, buf2, len2, maxcount); | |
9854 break; | |
9855 case PyUnicode_4BYTE_KIND: | |
9856 out = ucs4lib_rsplit( | |
9857 self, buf1, len1, buf2, len2, maxcount); | |
9858 break; | |
9859 default: | |
9860 out = NULL; | |
9861 } | |
9862 if (kind1 != kind) | |
9863 PyMem_Free(buf1); | |
9864 if (kind2 != kind) | |
9865 PyMem_Free(buf2); | |
9866 return out; | |
9867 } | |
9868 | |
9869 static Py_ssize_t | |
9870 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, | |
9871 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) | |
9872 { | |
9873 switch(kind) { | |
9874 case PyUnicode_1BYTE_KIND: | |
9875 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) | |
9876 return asciilib_find(buf1, len1, buf2, len2, offset); | |
9877 else | |
9878 return ucs1lib_find(buf1, len1, buf2, len2, offset); | |
9879 case PyUnicode_2BYTE_KIND: | |
9880 return ucs2lib_find(buf1, len1, buf2, len2, offset); | |
9881 case PyUnicode_4BYTE_KIND: | |
9882 return ucs4lib_find(buf1, len1, buf2, len2, offset); | |
9883 } | |
9884 assert(0); | |
9885 return -1; | |
9886 } | |
9887 | |
9888 static Py_ssize_t | |
9889 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, | |
9890 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) | |
9891 { | |
9892 switch(kind) { | |
9893 case PyUnicode_1BYTE_KIND: | |
9894 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) | |
9895 return asciilib_count(sbuf, slen, buf1, len1, maxcount); | |
9896 else | |
9897 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); | |
9898 case PyUnicode_2BYTE_KIND: | |
9899 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); | |
9900 case PyUnicode_4BYTE_KIND: | |
9901 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); | |
9902 } | |
9903 assert(0); | |
9904 return 0; | |
9905 } | |
9906 | |
9907 static PyObject * | |
9908 replace(PyObject *self, PyObject *str1, | |
9909 PyObject *str2, Py_ssize_t maxcount) | |
9910 { | |
9911 PyObject *u; | |
9912 char *sbuf = PyUnicode_DATA(self); | |
9913 char *buf1 = PyUnicode_DATA(str1); | |
9914 char *buf2 = PyUnicode_DATA(str2); | |
9915 int srelease = 0, release1 = 0, release2 = 0; | |
9916 int skind = PyUnicode_KIND(self); | |
9917 int kind1 = PyUnicode_KIND(str1); | |
9918 int kind2 = PyUnicode_KIND(str2); | |
9919 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); | |
9920 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); | |
9921 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); | |
9922 int mayshrink; | |
9923 Py_UCS4 maxchar, maxchar_str2; | |
9924 | |
9925 if (maxcount < 0) | |
9926 maxcount = PY_SSIZE_T_MAX; | |
9927 else if (maxcount == 0 || slen == 0) | |
9928 goto nothing; | |
9929 | |
9930 if (str1 == str2) | |
9931 goto nothing; | |
9932 if (skind < kind1) | |
9933 /* substring too wide to be present */ | |
9934 goto nothing; | |
9935 | |
9936 maxchar = PyUnicode_MAX_CHAR_VALUE(self); | |
9937 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); | |
9938 /* Replacing str1 with str2 may cause a maxchar reduction in the | |
9939 result string. */ | |
9940 mayshrink = (maxchar_str2 < maxchar); | |
9941 maxchar = Py_MAX(maxchar, maxchar_str2); | |
9942 | |
9943 if (len1 == len2) { | |
9944 Py_ssize_t i; | |
9945 /* same length */ | |
9946 if (len1 == 0) | |
9947 goto nothing; | |
9948 if (len1 == 1) { | |
9949 /* replace characters */ | |
9950 Py_UCS4 u1, u2; | |
9951 int rkind; | |
9952 u1 = PyUnicode_READ_CHAR(str1, 0); | |
9953 if (findchar(sbuf, PyUnicode_KIND(self), | |
9954 slen, u1, 1) < 0) | |
9955 goto nothing; | |
9956 u2 = PyUnicode_READ_CHAR(str2, 0); | |
9957 u = PyUnicode_New(slen, maxchar); | |
9958 if (!u) | |
9959 goto error; | |
9960 copy_characters(u, 0, self, 0, slen); | |
9961 rkind = PyUnicode_KIND(u); | |
9962 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) | |
9963 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { | |
9964 if (--maxcount < 0) | |
9965 break; | |
9966 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); | |
9967 } | |
9968 } | |
9969 else { | |
9970 int rkind = skind; | |
9971 char *res; | |
9972 | |
9973 if (kind1 < rkind) { | |
9974 /* widen substring */ | |
9975 buf1 = _PyUnicode_AsKind(str1, rkind); | |
9976 if (!buf1) goto error; | |
9977 release1 = 1; | |
9978 } | |
9979 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); | |
9980 if (i < 0) | |
9981 goto nothing; | |
9982 if (rkind > kind2) { | |
9983 /* widen replacement */ | |
9984 buf2 = _PyUnicode_AsKind(str2, rkind); | |
9985 if (!buf2) goto error; | |
9986 release2 = 1; | |
9987 } | |
9988 else if (rkind < kind2) { | |
9989 /* widen self and buf1 */ | |
9990 rkind = kind2; | |
9991 if (release1) PyMem_Free(buf1); | |
9992 sbuf = _PyUnicode_AsKind(self, rkind); | |
9993 if (!sbuf) goto error; | |
9994 srelease = 1; | |
9995 buf1 = _PyUnicode_AsKind(str1, rkind); | |
9996 if (!buf1) goto error; | |
9997 release1 = 1; | |
9998 } | |
9999 u = PyUnicode_New(slen, maxchar); | |
10000 if (!u) | |
10001 goto error; | |
10002 assert(PyUnicode_KIND(u) == rkind); | |
10003 res = PyUnicode_DATA(u); | |
10004 | |
10005 memcpy(res, sbuf, rkind * slen); | |
10006 /* change everything in-place, starting with this one */ | |
10007 memcpy(res + rkind * i, | |
10008 buf2, | |
10009 rkind * len2); | |
10010 i += len1; | |
10011 | |
10012 while ( --maxcount > 0) { | |
10013 i = anylib_find(rkind, self, | |
10014 sbuf+rkind*i, slen-i, | |
10015 str1, buf1, len1, i); | |
10016 if (i == -1) | |
10017 break; | |
10018 memcpy(res + rkind * i, | |
10019 buf2, | |
10020 rkind * len2); | |
10021 i += len1; | |
10022 } | |
10023 } | |
10024 } | |
10025 else { | |
10026 Py_ssize_t n, i, j, ires; | |
10027 Py_ssize_t product, new_size; | |
10028 int rkind = skind; | |
10029 char *res; | |
10030 | |
10031 if (kind1 < rkind) { | |
10032 /* widen substring */ | |
10033 buf1 = _PyUnicode_AsKind(str1, rkind); | |
10034 if (!buf1) goto error; | |
10035 release1 = 1; | |
10036 } | |
10037 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); | |
10038 if (n == 0) | |
10039 goto nothing; | |
10040 if (kind2 < rkind) { | |
10041 /* widen replacement */ | |
10042 buf2 = _PyUnicode_AsKind(str2, rkind); | |
10043 if (!buf2) goto error; | |
10044 release2 = 1; | |
10045 } | |
10046 else if (kind2 > rkind) { | |
10047 /* widen self and buf1 */ | |
10048 rkind = kind2; | |
10049 sbuf = _PyUnicode_AsKind(self, rkind); | |
10050 if (!sbuf) goto error; | |
10051 srelease = 1; | |
10052 if (release1) PyMem_Free(buf1); | |
10053 buf1 = _PyUnicode_AsKind(str1, rkind); | |
10054 if (!buf1) goto error; | |
10055 release1 = 1; | |
10056 } | |
10057 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - | |
10058 PyUnicode_GET_LENGTH(str1))); */ | |
10059 product = n * (len2-len1); | |
10060 if ((product / (len2-len1)) != n) { | |
10061 PyErr_SetString(PyExc_OverflowError, | |
10062 "replace string is too long"); | |
10063 goto error; | |
10064 } | |
10065 new_size = slen + product; | |
10066 if (new_size == 0) { | |
10067 Py_INCREF(unicode_empty); | |
10068 u = unicode_empty; | |
10069 goto done; | |
10070 } | |
10071 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { | |
10072 PyErr_SetString(PyExc_OverflowError, | |
10073 "replace string is too long"); | |
10074 goto error; | |
10075 } | |
10076 u = PyUnicode_New(new_size, maxchar); | |
10077 if (!u) | |
10078 goto error; | |
10079 assert(PyUnicode_KIND(u) == rkind); | |
10080 res = PyUnicode_DATA(u); | |
10081 ires = i = 0; | |
10082 if (len1 > 0) { | |
10083 while (n-- > 0) { | |
10084 /* look for next match */ | |
10085 j = anylib_find(rkind, self, | |
10086 sbuf + rkind * i, slen-i, | |
10087 str1, buf1, len1, i); | |
10088 if (j == -1) | |
10089 break; | |
10090 else if (j > i) { | |
10091 /* copy unchanged part [i:j] */ | |
10092 memcpy(res + rkind * ires, | |
10093 sbuf + rkind * i, | |
10094 rkind * (j-i)); | |
10095 ires += j - i; | |
10096 } | |
10097 /* copy substitution string */ | |
10098 if (len2 > 0) { | |
10099 memcpy(res + rkind * ires, | |
10100 buf2, | |
10101 rkind * len2); | |
10102 ires += len2; | |
10103 } | |
10104 i = j + len1; | |
10105 } | |
10106 if (i < slen) | |
10107 /* copy tail [i:] */ | |
10108 memcpy(res + rkind * ires, | |
10109 sbuf + rkind * i, | |
10110 rkind * (slen-i)); | |
10111 } | |
10112 else { | |
10113 /* interleave */ | |
10114 while (n > 0) { | |
10115 memcpy(res + rkind * ires, | |
10116 buf2, | |
10117 rkind * len2); | |
10118 ires += len2; | |
10119 if (--n <= 0) | |
10120 break; | |
10121 memcpy(res + rkind * ires, | |
10122 sbuf + rkind * i, | |
10123 rkind); | |
10124 ires++; | |
10125 i++; | |
10126 } | |
10127 memcpy(res + rkind * ires, | |
10128 sbuf + rkind * i, | |
10129 rkind * (slen-i)); | |
10130 } | |
10131 } | |
10132 | |
10133 if (mayshrink) { | |
10134 unicode_adjust_maxchar(&u); | |
10135 if (u == NULL) | |
10136 goto error; | |
10137 } | |
10138 | |
10139 done: | |
10140 if (srelease) | |
10141 PyMem_FREE(sbuf); | |
10142 if (release1) | |
10143 PyMem_FREE(buf1); | |
10144 if (release2) | |
10145 PyMem_FREE(buf2); | |
10146 assert(_PyUnicode_CheckConsistency(u, 1)); | |
10147 return u; | |
10148 | |
10149 nothing: | |
10150 /* nothing to replace; return original string (when possible) */ | |
10151 if (srelease) | |
10152 PyMem_FREE(sbuf); | |
10153 if (release1) | |
10154 PyMem_FREE(buf1); | |
10155 if (release2) | |
10156 PyMem_FREE(buf2); | |
10157 if (PyUnicode_CheckExact(self)) { | |
10158 Py_INCREF(self); | |
10159 return self; | |
10160 } | |
10161 return PyUnicode_Copy(self); | |
10162 error: | |
10163 if (srelease && sbuf) | |
10164 PyMem_FREE(sbuf); | |
10165 if (release1 && buf1) | |
10166 PyMem_FREE(buf1); | |
10167 if (release2 && buf2) | |
10168 PyMem_FREE(buf2); | |
10169 return NULL; | |
10170 } | |
10171 | |
10172 /* --- Unicode Object Methods --------------------------------------------- */ | |
10173 | |
10174 PyDoc_STRVAR(title__doc__, | |
10175 "S.title() -> str\n\ | |
10176 \n\ | |
10177 Return a titlecased version of S, i.e. words start with title case\n\ | |
10178 characters, all remaining cased characters have lower case."); | |
10179 | |
10180 static PyObject* | |
10181 unicode_title(PyObject *self) | |
10182 { | |
10183 return fixup(self, fixtitle); | |
10184 } | |
10185 | |
10186 PyDoc_STRVAR(capitalize__doc__, | |
10187 "S.capitalize() -> str\n\ | |
10188 \n\ | |
10189 Return a capitalized version of S, i.e. make the first character\n\ | |
10190 have upper case and the rest lower case."); | |
10191 | |
10192 static PyObject* | |
10193 unicode_capitalize(PyObject *self) | |
10194 { | |
10195 return fixup(self, fixcapitalize); | |
10196 } | |
10197 | |
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self into words, replaces every list item by its capitalized
   version and joins the list again.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t pos = 0;

    /* Break the string up into a list of words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized counterpart */
    while (pos < PyList_GET_SIZE(words)) {
        result = fixup(PyList_GET_ITEM(words, pos), fixcapitalize);
        if (result == NULL)
            goto finish;
        /* Drop the old item by hand: PyList_SET_ITEM does not release it */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
        pos++;
    }

    /* Glue the words back together */
    result = PyUnicode_Join(NULL, words);

  finish:
    Py_DECREF(words);
    return result;
}
#endif
10235 | |
10236 /* Argument converter. Coerces to a single unicode character */ | |
10237 | |
10238 static int | |
10239 convert_uc(PyObject *obj, void *addr) | |
10240 { | |
10241 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; | |
10242 PyObject *uniobj; | |
10243 | |
10244 uniobj = PyUnicode_FromObject(obj); | |
10245 if (uniobj == NULL) { | |
10246 PyErr_SetString(PyExc_TypeError, | |
10247 "The fill character cannot be converted to Unicode"); | |
10248 return 0; | |
10249 } | |
10250 if (PyUnicode_GET_LENGTH(uniobj) != 1) { | |
10251 PyErr_SetString(PyExc_TypeError, | |
10252 "The fill character must be exactly one character long"); | |
10253 Py_DECREF(uniobj); | |
10254 return 0; | |
10255 } | |
10256 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); | |
10257 Py_DECREF(uniobj); | |
10258 return 1; | |
10259 } | |
10260 | |
10261 PyDoc_STRVAR(center__doc__, | |
10262 "S.center(width[, fillchar]) -> str\n\ | |
10263 \n\ | |
10264 Return S centered in a string of length width. Padding is\n\ | |
10265 done using the specified fill character (default is a space)"); | |
10266 | |
10267 static PyObject * | |
10268 unicode_center(PyObject *self, PyObject *args) | |
10269 { | |
10270 Py_ssize_t marg, left; | |
10271 Py_ssize_t width; | |
10272 Py_UCS4 fillchar = ' '; | |
10273 | |
10274 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) | |
10275 return NULL; | |
10276 | |
10277 if (PyUnicode_READY(self) == -1) | |
10278 return NULL; | |
10279 | |
10280 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { | |
10281 Py_INCREF(self); | |
10282 return self; | |
10283 } | |
10284 | |
10285 marg = width - _PyUnicode_LENGTH(self); | |
10286 left = marg / 2 + (marg & width & 1); | |
10287 | |
10288 return pad(self, left, marg - left, fillchar); | |
10289 } | |
10290 | |
10291 /* This function assumes that str1 and str2 are readied by the caller. */ | |
10292 | |
10293 static int | |
10294 unicode_compare(PyObject *str1, PyObject *str2) | |
10295 { | |
10296 int kind1, kind2; | |
10297 void *data1, *data2; | |
10298 Py_ssize_t len1, len2, i; | |
10299 | |
10300 kind1 = PyUnicode_KIND(str1); | |
10301 kind2 = PyUnicode_KIND(str2); | |
10302 data1 = PyUnicode_DATA(str1); | |
10303 data2 = PyUnicode_DATA(str2); | |
10304 len1 = PyUnicode_GET_LENGTH(str1); | |
10305 len2 = PyUnicode_GET_LENGTH(str2); | |
10306 | |
10307 for (i = 0; i < len1 && i < len2; ++i) { | |
10308 Py_UCS4 c1, c2; | |
10309 c1 = PyUnicode_READ(kind1, data1, i); | |
10310 c2 = PyUnicode_READ(kind2, data2, i); | |
10311 | |
10312 if (c1 != c2) | |
10313 return (c1 < c2) ? -1 : 1; | |
10314 } | |
10315 | |
10316 return (len1 < len2) ? -1 : (len1 != len2); | |
10317 } | |
10318 | |
10319 int | |
10320 PyUnicode_Compare(PyObject *left, PyObject *right) | |
10321 { | |
10322 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { | |
10323 if (PyUnicode_READY(left) == -1 || | |
10324 PyUnicode_READY(right) == -1) | |
10325 return -1; | |
10326 return unicode_compare(left, right); | |
10327 } | |
10328 PyErr_Format(PyExc_TypeError, | |
10329 "Can't compare %.100s and %.100s", | |
10330 left->ob_type->tp_name, | |
10331 right->ob_type->tp_name); | |
10332 return -1; | |
10333 } | |
10334 | |
10335 int | |
10336 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) | |
10337 { | |
10338 Py_ssize_t i; | |
10339 int kind; | |
10340 void *data; | |
10341 Py_UCS4 chr; | |
10342 | |
10343 assert(_PyUnicode_CHECK(uni)); | |
10344 if (PyUnicode_READY(uni) == -1) | |
10345 return -1; | |
10346 kind = PyUnicode_KIND(uni); | |
10347 data = PyUnicode_DATA(uni); | |
10348 /* Compare Unicode string and source character set string */ | |
10349 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) | |
10350 if (chr != str[i]) | |
10351 return (chr < (unsigned char)(str[i])) ? -1 : 1; | |
10352 /* This check keeps Python strings that end in '\0' from comparing equal | |
10353 to C strings identical up to that point. */ | |
10354 if (PyUnicode_GET_LENGTH(uni) != i || chr) | |
10355 return 1; /* uni is longer */ | |
10356 if (str[i]) | |
10357 return -1; /* str is longer */ | |
10358 return 0; | |
10359 } | |
10360 | |
10361 | |
10362 #define TEST_COND(cond) \ | |
10363 ((cond) ? Py_True : Py_False) | |
10364 | |
10365 PyObject * | |
10366 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) | |
10367 { | |
10368 int result; | |
10369 | |
10370 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { | |
10371 PyObject *v; | |
10372 if (PyUnicode_READY(left) == -1 || | |
10373 PyUnicode_READY(right) == -1) | |
10374 return NULL; | |
10375 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || | |
10376 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { | |
10377 if (op == Py_EQ) { | |
10378 Py_INCREF(Py_False); | |
10379 return Py_False; | |
10380 } | |
10381 if (op == Py_NE) { | |
10382 Py_INCREF(Py_True); | |
10383 return Py_True; | |
10384 } | |
10385 } | |
10386 if (left == right) | |
10387 result = 0; | |
10388 else | |
10389 result = unicode_compare(left, right); | |
10390 | |
10391 /* Convert the return value to a Boolean */ | |
10392 switch (op) { | |
10393 case Py_EQ: | |
10394 v = TEST_COND(result == 0); | |
10395 break; | |
10396 case Py_NE: | |
10397 v = TEST_COND(result != 0); | |
10398 break; | |
10399 case Py_LE: | |
10400 v = TEST_COND(result <= 0); | |
10401 break; | |
10402 case Py_GE: | |
10403 v = TEST_COND(result >= 0); | |
10404 break; | |
10405 case Py_LT: | |
10406 v = TEST_COND(result == -1); | |
10407 break; | |
10408 case Py_GT: | |
10409 v = TEST_COND(result == 1); | |
10410 break; | |
10411 default: | |
10412 PyErr_BadArgument(); | |
10413 return NULL; | |
10414 } | |
10415 Py_INCREF(v); | |
10416 return v; | |
10417 } | |
10418 | |
10419 Py_RETURN_NOTIMPLEMENTED; | |
10420 } | |
10421 | |
10422 int | |
10423 PyUnicode_Contains(PyObject *container, PyObject *element) | |
10424 { | |
10425 PyObject *str, *sub; | |
10426 int kind1, kind2, kind; | |
10427 void *buf1, *buf2; | |
10428 Py_ssize_t len1, len2; | |
10429 int result; | |
10430 | |
10431 /* Coerce the two arguments */ | |
10432 sub = PyUnicode_FromObject(element); | |
10433 if (!sub) { | |
10434 PyErr_Format(PyExc_TypeError, | |
10435 "'in <string>' requires string as left operand, not %s", | |
10436 element->ob_type->tp_name); | |
10437 return -1; | |
10438 } | |
10439 if (PyUnicode_READY(sub) == -1) | |
10440 return -1; | |
10441 | |
10442 str = PyUnicode_FromObject(container); | |
10443 if (!str || PyUnicode_READY(str) == -1) { | |
10444 Py_DECREF(sub); | |
10445 return -1; | |
10446 } | |
10447 | |
10448 kind1 = PyUnicode_KIND(str); | |
10449 kind2 = PyUnicode_KIND(sub); | |
10450 kind = kind1 > kind2 ? kind1 : kind2; | |
10451 buf1 = PyUnicode_DATA(str); | |
10452 buf2 = PyUnicode_DATA(sub); | |
10453 if (kind1 != kind) | |
10454 buf1 = _PyUnicode_AsKind(str, kind); | |
10455 if (!buf1) { | |
10456 Py_DECREF(sub); | |
10457 return -1; | |
10458 } | |
10459 if (kind2 != kind) | |
10460 buf2 = _PyUnicode_AsKind(sub, kind); | |
10461 if (!buf2) { | |
10462 Py_DECREF(sub); | |
10463 if (kind1 != kind) PyMem_Free(buf1); | |
10464 return -1; | |
10465 } | |
10466 len1 = PyUnicode_GET_LENGTH(str); | |
10467 len2 = PyUnicode_GET_LENGTH(sub); | |
10468 | |
10469 switch(kind) { | |
10470 case PyUnicode_1BYTE_KIND: | |
10471 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; | |
10472 break; | |
10473 case PyUnicode_2BYTE_KIND: | |
10474 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; | |
10475 break; | |
10476 case PyUnicode_4BYTE_KIND: | |
10477 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; | |
10478 break; | |
10479 default: | |
10480 result = -1; | |
10481 assert(0); | |
10482 } | |
10483 | |
10484 Py_DECREF(str); | |
10485 Py_DECREF(sub); | |
10486 | |
10487 if (kind1 != kind) | |
10488 PyMem_Free(buf1); | |
10489 if (kind2 != kind) | |
10490 PyMem_Free(buf2); | |
10491 | |
10492 return result; | |
10493 } | |
10494 | |
10495 /* Concat to string or Unicode object giving a new Unicode object. */ | |
10496 | |
10497 PyObject * | |
10498 PyUnicode_Concat(PyObject *left, PyObject *right) | |
10499 { | |
10500 PyObject *u = NULL, *v = NULL, *w; | |
10501 Py_UCS4 maxchar, maxchar2; | |
10502 | |
10503 /* Coerce the two arguments */ | |
10504 u = PyUnicode_FromObject(left); | |
10505 if (u == NULL) | |
10506 goto onError; | |
10507 v = PyUnicode_FromObject(right); | |
10508 if (v == NULL) | |
10509 goto onError; | |
10510 | |
10511 /* Shortcuts */ | |
10512 if (v == unicode_empty) { | |
10513 Py_DECREF(v); | |
10514 return u; | |
10515 } | |
10516 if (u == unicode_empty) { | |
10517 Py_DECREF(u); | |
10518 return v; | |
10519 } | |
10520 | |
10521 maxchar = PyUnicode_MAX_CHAR_VALUE(u); | |
10522 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); | |
10523 maxchar = Py_MAX(maxchar, maxchar2); | |
10524 | |
10525 /* Concat the two Unicode strings */ | |
10526 w = PyUnicode_New( | |
10527 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), | |
10528 maxchar); | |
10529 if (w == NULL) | |
10530 goto onError; | |
10531 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)); | |
10532 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v)); | |
10533 Py_DECREF(u); | |
10534 Py_DECREF(v); | |
10535 assert(_PyUnicode_CheckConsistency(w, 1)); | |
10536 return w; | |
10537 | |
10538 onError: | |
10539 Py_XDECREF(u); | |
10540 Py_XDECREF(v); | |
10541 return NULL; | |
10542 } | |
10543 | |
10544 static void | |
10545 unicode_append_inplace(PyObject **p_left, PyObject *right) | |
10546 { | |
10547 Py_ssize_t left_len, right_len, new_len; | |
10548 | |
10549 assert(PyUnicode_IS_READY(*p_left)); | |
10550 assert(PyUnicode_IS_READY(right)); | |
10551 | |
10552 left_len = PyUnicode_GET_LENGTH(*p_left); | |
10553 right_len = PyUnicode_GET_LENGTH(right); | |
10554 if (left_len > PY_SSIZE_T_MAX - right_len) { | |
10555 PyErr_SetString(PyExc_OverflowError, | |
10556 "strings are too large to concat"); | |
10557 goto error; | |
10558 } | |
10559 new_len = left_len + right_len; | |
10560 | |
10561 /* Now we own the last reference to 'left', so we can resize it | |
10562 * in-place. | |
10563 */ | |
10564 if (unicode_resize(p_left, new_len) != 0) { | |
10565 /* XXX if _PyUnicode_Resize() fails, 'left' has been | |
10566 * deallocated so it cannot be put back into | |
10567 * 'variable'. The MemoryError is raised when there | |
10568 * is no value in 'variable', which might (very | |
10569 * remotely) be a cause of incompatibilities. | |
10570 */ | |
10571 goto error; | |
10572 } | |
10573 /* copy 'right' into the newly allocated area of 'left' */ | |
10574 copy_characters(*p_left, left_len, right, 0, right_len); | |
10575 _PyUnicode_DIRTY(*p_left); | |
10576 return; | |
10577 | |
10578 error: | |
10579 Py_DECREF(*p_left); | |
10580 *p_left = NULL; | |
10581 } | |
10582 | |
10583 void | |
10584 PyUnicode_Append(PyObject **p_left, PyObject *right) | |
10585 { | |
10586 PyObject *left, *res; | |
10587 | |
10588 if (p_left == NULL) { | |
10589 if (!PyErr_Occurred()) | |
10590 PyErr_BadInternalCall(); | |
10591 return; | |
10592 } | |
10593 left = *p_left; | |
10594 if (right == NULL || !PyUnicode_Check(left)) { | |
10595 if (!PyErr_Occurred()) | |
10596 PyErr_BadInternalCall(); | |
10597 goto error; | |
10598 } | |
10599 | |
10600 if (PyUnicode_READY(left)) | |
10601 goto error; | |
10602 if (PyUnicode_READY(right)) | |
10603 goto error; | |
10604 | |
10605 if (PyUnicode_CheckExact(left) && left != unicode_empty | |
10606 && PyUnicode_CheckExact(right) && right != unicode_empty | |
10607 && unicode_resizable(left) | |
10608 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) | |
10609 || _PyUnicode_WSTR(left) != NULL)) | |
10610 { | |
10611 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires | |
10612 to change the structure size, but characters are stored just after | |
10613 the structure, and so it requires to move all characters which is | |
10614 not so different than duplicating the string. */ | |
10615 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) | |
10616 { | |
10617 unicode_append_inplace(p_left, right); | |
10618 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1)); | |
10619 return; | |
10620 } | |
10621 } | |
10622 | |
10623 res = PyUnicode_Concat(left, right); | |
10624 if (res == NULL) | |
10625 goto error; | |
10626 Py_DECREF(left); | |
10627 *p_left = res; | |
10628 return; | |
10629 | |
10630 error: | |
10631 Py_DECREF(*p_left); | |
10632 *p_left = NULL; | |
10633 } | |
10634 | |
10635 void | |
10636 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) | |
10637 { | |
10638 PyUnicode_Append(pleft, right); | |
10639 Py_XDECREF(right); | |
10640 } | |
10641 | |
10642 PyDoc_STRVAR(count__doc__, | |
10643 "S.count(sub[, start[, end]]) -> int\n\ | |
10644 \n\ | |
10645 Return the number of non-overlapping occurrences of substring sub in\n\ | |
10646 string S[start:end]. Optional arguments start and end are\n\ | |
10647 interpreted as in slice notation."); | |
10648 | |
10649 static PyObject * | |
10650 unicode_count(PyObject *self, PyObject *args) | |
10651 { | |
10652 PyObject *substring; | |
10653 Py_ssize_t start = 0; | |
10654 Py_ssize_t end = PY_SSIZE_T_MAX; | |
10655 PyObject *result; | |
10656 int kind1, kind2, kind; | |
10657 void *buf1, *buf2; | |
10658 Py_ssize_t len1, len2, iresult; | |
10659 | |
10660 if (!stringlib_parse_args_finds_unicode("count", args, &substring, | |
10661 &start, &end)) | |
10662 return NULL; | |
10663 | |
10664 kind1 = PyUnicode_KIND(self); | |
10665 kind2 = PyUnicode_KIND(substring); | |
10666 kind = kind1 > kind2 ? kind1 : kind2; | |
10667 buf1 = PyUnicode_DATA(self); | |
10668 buf2 = PyUnicode_DATA(substring); | |
10669 if (kind1 != kind) | |
10670 buf1 = _PyUnicode_AsKind(self, kind); | |
10671 if (!buf1) { | |
10672 Py_DECREF(substring); | |
10673 return NULL; | |
10674 } | |
10675 if (kind2 != kind) | |
10676 buf2 = _PyUnicode_AsKind(substring, kind); | |
10677 if (!buf2) { | |
10678 Py_DECREF(substring); | |
10679 if (kind1 != kind) PyMem_Free(buf1); | |
10680 return NULL; | |
10681 } | |
10682 len1 = PyUnicode_GET_LENGTH(self); | |
10683 len2 = PyUnicode_GET_LENGTH(substring); | |
10684 | |
10685 ADJUST_INDICES(start, end, len1); | |
10686 switch(kind) { | |
10687 case PyUnicode_1BYTE_KIND: | |
10688 iresult = ucs1lib_count( | |
10689 ((Py_UCS1*)buf1) + start, end - start, | |
10690 buf2, len2, PY_SSIZE_T_MAX | |
10691 ); | |
10692 break; | |
10693 case PyUnicode_2BYTE_KIND: | |
10694 iresult = ucs2lib_count( | |
10695 ((Py_UCS2*)buf1) + start, end - start, | |
10696 buf2, len2, PY_SSIZE_T_MAX | |
10697 ); | |
10698 break; | |
10699 case PyUnicode_4BYTE_KIND: | |
10700 iresult = ucs4lib_count( | |
10701 ((Py_UCS4*)buf1) + start, end - start, | |
10702 buf2, len2, PY_SSIZE_T_MAX | |
10703 ); | |
10704 break; | |
10705 default: | |
10706 assert(0); iresult = 0; | |
10707 } | |
10708 | |
10709 result = PyLong_FromSsize_t(iresult); | |
10710 | |
10711 if (kind1 != kind) | |
10712 PyMem_Free(buf1); | |
10713 if (kind2 != kind) | |
10714 PyMem_Free(buf2); | |
10715 | |
10716 Py_DECREF(substring); | |
10717 | |
10718 return result; | |
10719 } | |
10720 | |
10721 PyDoc_STRVAR(encode__doc__, | |
10722 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ | |
10723 \n\ | |
10724 Encode S using the codec registered for encoding. Default encoding\n\ | |
10725 is 'utf-8'. errors may be given to set a different error\n\ | |
10726 handling scheme. Default is 'strict' meaning that encoding errors raise\n\ | |
10727 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ | |
10728 'xmlcharrefreplace' as well as any other name registered with\n\ | |
10729 codecs.register_error that can handle UnicodeEncodeErrors."); | |
10730 | |
10731 static PyObject * | |
10732 unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) | |
10733 { | |
10734 static char *kwlist[] = {"encoding", "errors", 0}; | |
10735 char *encoding = NULL; | |
10736 char *errors = NULL; | |
10737 | |
10738 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", | |
10739 kwlist, &encoding, &errors)) | |
10740 return NULL; | |
10741 return PyUnicode_AsEncodedString(self, encoding, errors); | |
10742 } | |
10743 | |
10744 PyDoc_STRVAR(expandtabs__doc__, | |
10745 "S.expandtabs([tabsize]) -> str\n\ | |
10746 \n\ | |
10747 Return a copy of S where all tab characters are expanded using spaces.\n\ | |
10748 If tabsize is not given, a tab size of 8 characters is assumed."); | |
10749 | |
10750 static PyObject* | |
10751 unicode_expandtabs(PyObject *self, PyObject *args) | |
10752 { | |
10753 Py_ssize_t i, j, line_pos, src_len, incr; | |
10754 Py_UCS4 ch; | |
10755 PyObject *u; | |
10756 void *src_data, *dest_data; | |
10757 int tabsize = 8; | |
10758 int kind; | |
10759 int found; | |
10760 | |
10761 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) | |
10762 return NULL; | |
10763 | |
10764 if (PyUnicode_READY(self) == -1) | |
10765 return NULL; | |
10766 | |
10767 /* First pass: determine size of output string */ | |
10768 src_len = PyUnicode_GET_LENGTH(self); | |
10769 i = j = line_pos = 0; | |
10770 kind = PyUnicode_KIND(self); | |
10771 src_data = PyUnicode_DATA(self); | |
10772 found = 0; | |
10773 for (; i < src_len; i++) { | |
10774 ch = PyUnicode_READ(kind, src_data, i); | |
10775 if (ch == '\t') { | |
10776 found = 1; | |
10777 if (tabsize > 0) { | |
10778 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ | |
10779 if (j > PY_SSIZE_T_MAX - incr) | |
10780 goto overflow; | |
10781 line_pos += incr; | |
10782 j += incr; | |
10783 } | |
10784 } | |
10785 else { | |
10786 if (j > PY_SSIZE_T_MAX - 1) | |
10787 goto overflow; | |
10788 line_pos++; | |
10789 j++; | |
10790 if (ch == '\n' || ch == '\r') | |
10791 line_pos = 0; | |
10792 } | |
10793 } | |
10794 if (!found && PyUnicode_CheckExact(self)) { | |
10795 Py_INCREF(self); | |
10796 return self; | |
10797 } | |
10798 | |
10799 /* Second pass: create output string and fill it */ | |
10800 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); | |
10801 if (!u) | |
10802 return NULL; | |
10803 dest_data = PyUnicode_DATA(u); | |
10804 | |
10805 i = j = line_pos = 0; | |
10806 | |
10807 for (; i < src_len; i++) { | |
10808 ch = PyUnicode_READ(kind, src_data, i); | |
10809 if (ch == '\t') { | |
10810 if (tabsize > 0) { | |
10811 incr = tabsize - (line_pos % tabsize); | |
10812 line_pos += incr; | |
10813 while (incr--) { | |
10814 PyUnicode_WRITE(kind, dest_data, j, ' '); | |
10815 j++; | |
10816 } | |
10817 } | |
10818 } | |
10819 else { | |
10820 line_pos++; | |
10821 PyUnicode_WRITE(kind, dest_data, j, ch); | |
10822 j++; | |
10823 if (ch == '\n' || ch == '\r') | |
10824 line_pos = 0; | |
10825 } | |
10826 } | |
10827 assert (j == PyUnicode_GET_LENGTH(u)); | |
10828 return unicode_result(u); | |
10829 | |
10830 overflow: | |
10831 PyErr_SetString(PyExc_OverflowError, "new string is too long"); | |
10832 return NULL; | |
10833 } | |
10834 | |
10835 PyDoc_STRVAR(find__doc__, | |
10836 "S.find(sub[, start[, end]]) -> int\n\ | |
10837 \n\ | |
10838 Return the lowest index in S where substring sub is found,\n\ | |
10839 such that sub is contained within S[start:end]. Optional\n\ | |
10840 arguments start and end are interpreted as in slice notation.\n\ | |
10841 \n\ | |
10842 Return -1 on failure."); | |
10843 | |
10844 static PyObject * | |
10845 unicode_find(PyObject *self, PyObject *args) | |
10846 { | |
10847 PyObject *substring; | |
10848 Py_ssize_t start; | |
10849 Py_ssize_t end; | |
10850 Py_ssize_t result; | |
10851 | |
10852 if (!stringlib_parse_args_finds_unicode("find", args, &substring, | |
10853 &start, &end)) | |
10854 return NULL; | |
10855 | |
10856 if (PyUnicode_READY(self) == -1) | |
10857 return NULL; | |
10858 if (PyUnicode_READY(substring) == -1) | |
10859 return NULL; | |
10860 | |
10861 result = any_find_slice(1, self, substring, start, end); | |
10862 | |
10863 Py_DECREF(substring); | |
10864 | |
10865 if (result == -2) | |
10866 return NULL; | |
10867 | |
10868 return PyLong_FromSsize_t(result); | |
10869 } | |
10870 | |
10871 static PyObject * | |
10872 unicode_getitem(PyObject *self, Py_ssize_t index) | |
10873 { | |
10874 Py_UCS4 ch = PyUnicode_ReadChar(self, index); | |
10875 if (ch == (Py_UCS4)-1) | |
10876 return NULL; | |
10877 return PyUnicode_FromOrdinal(ch); | |
10878 } | |
10879 | |
10880 /* Believe it or not, this produces the same value for ASCII strings | |
10881 as bytes_hash(). */ | |
10882 static Py_hash_t | |
10883 unicode_hash(PyObject *self) | |
10884 { | |
10885 Py_ssize_t len; | |
10886 Py_uhash_t x; | |
10887 | |
10888 if (_PyUnicode_HASH(self) != -1) | |
10889 return _PyUnicode_HASH(self); | |
10890 if (PyUnicode_READY(self) == -1) | |
10891 return -1; | |
10892 len = PyUnicode_GET_LENGTH(self); | |
10893 | |
10894 /* The hash function as a macro, gets expanded three times below. */ | |
10895 #define HASH(P) \ | |
10896 x = (Py_uhash_t)*P << 7; \ | |
10897 while (--len >= 0) \ | |
10898 x = (1000003*x) ^ (Py_uhash_t)*P++; | |
10899 | |
10900 switch (PyUnicode_KIND(self)) { | |
10901 case PyUnicode_1BYTE_KIND: { | |
10902 const unsigned char *c = PyUnicode_1BYTE_DATA(self); | |
10903 HASH(c); | |
10904 break; | |
10905 } | |
10906 case PyUnicode_2BYTE_KIND: { | |
10907 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); | |
10908 HASH(s); | |
10909 break; | |
10910 } | |
10911 default: { | |
10912 Py_UCS4 *l; | |
10913 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && | |
10914 "Impossible switch case in unicode_hash"); | |
10915 l = PyUnicode_4BYTE_DATA(self); | |
10916 HASH(l); | |
10917 break; | |
10918 } | |
10919 } | |
10920 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self); | |
10921 | |
10922 if (x == -1) | |
10923 x = -2; | |
10924 _PyUnicode_HASH(self) = x; | |
10925 return x; | |
10926 } | |
10927 #undef HASH | |
10928 | |
10929 PyDoc_STRVAR(index__doc__, | |
10930 "S.index(sub[, start[, end]]) -> int\n\ | |
10931 \n\ | |
10932 Like S.find() but raise ValueError when the substring is not found."); | |
10933 | |
10934 static PyObject * | |
10935 unicode_index(PyObject *self, PyObject *args) | |
10936 { | |
10937 Py_ssize_t result; | |
10938 PyObject *substring; | |
10939 Py_ssize_t start; | |
10940 Py_ssize_t end; | |
10941 | |
10942 if (!stringlib_parse_args_finds_unicode("index", args, &substring, | |
10943 &start, &end)) | |
10944 return NULL; | |
10945 | |
10946 if (PyUnicode_READY(self) == -1) | |
10947 return NULL; | |
10948 if (PyUnicode_READY(substring) == -1) | |
10949 return NULL; | |
10950 | |
10951 result = any_find_slice(1, self, substring, start, end); | |
10952 | |
10953 Py_DECREF(substring); | |
10954 | |
10955 if (result == -2) | |
10956 return NULL; | |
10957 | |
10958 if (result < 0) { | |
10959 PyErr_SetString(PyExc_ValueError, "substring not found"); | |
10960 return NULL; | |
10961 } | |
10962 | |
10963 return PyLong_FromSsize_t(result); | |
10964 } | |
10965 | |
10966 PyDoc_STRVAR(islower__doc__, | |
10967 "S.islower() -> bool\n\ | |
10968 \n\ | |
10969 Return True if all cased characters in S are lowercase and there is\n\ | |
10970 at least one cased character in S, False otherwise."); | |
10971 | |
10972 static PyObject* | |
10973 unicode_islower(PyObject *self) | |
10974 { | |
10975 Py_ssize_t i, length; | |
10976 int kind; | |
10977 void *data; | |
10978 int cased; | |
10979 | |
10980 if (PyUnicode_READY(self) == -1) | |
10981 return NULL; | |
10982 length = PyUnicode_GET_LENGTH(self); | |
10983 kind = PyUnicode_KIND(self); | |
10984 data = PyUnicode_DATA(self); | |
10985 | |
10986 /* Shortcut for single character strings */ | |
10987 if (length == 1) | |
10988 return PyBool_FromLong( | |
10989 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); | |
10990 | |
10991 /* Special case for empty strings */ | |
10992 if (length == 0) | |
10993 return PyBool_FromLong(0); | |
10994 | |
10995 cased = 0; | |
10996 for (i = 0; i < length; i++) { | |
10997 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
10998 | |
10999 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) | |
11000 return PyBool_FromLong(0); | |
11001 else if (!cased && Py_UNICODE_ISLOWER(ch)) | |
11002 cased = 1; | |
11003 } | |
11004 return PyBool_FromLong(cased); | |
11005 } | |
11006 | |
11007 PyDoc_STRVAR(isupper__doc__, | |
11008 "S.isupper() -> bool\n\ | |
11009 \n\ | |
11010 Return True if all cased characters in S are uppercase and there is\n\ | |
11011 at least one cased character in S, False otherwise."); | |
11012 | |
11013 static PyObject* | |
11014 unicode_isupper(PyObject *self) | |
11015 { | |
11016 Py_ssize_t i, length; | |
11017 int kind; | |
11018 void *data; | |
11019 int cased; | |
11020 | |
11021 if (PyUnicode_READY(self) == -1) | |
11022 return NULL; | |
11023 length = PyUnicode_GET_LENGTH(self); | |
11024 kind = PyUnicode_KIND(self); | |
11025 data = PyUnicode_DATA(self); | |
11026 | |
11027 /* Shortcut for single character strings */ | |
11028 if (length == 1) | |
11029 return PyBool_FromLong( | |
11030 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); | |
11031 | |
11032 /* Special case for empty strings */ | |
11033 if (length == 0) | |
11034 return PyBool_FromLong(0); | |
11035 | |
11036 cased = 0; | |
11037 for (i = 0; i < length; i++) { | |
11038 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
11039 | |
11040 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) | |
11041 return PyBool_FromLong(0); | |
11042 else if (!cased && Py_UNICODE_ISUPPER(ch)) | |
11043 cased = 1; | |
11044 } | |
11045 return PyBool_FromLong(cased); | |
11046 } | |
11047 | |
11048 PyDoc_STRVAR(istitle__doc__, | |
11049 "S.istitle() -> bool\n\ | |
11050 \n\ | |
11051 Return True if S is a titlecased string and there is at least one\n\ | |
11052 character in S, i.e. upper- and titlecase characters may only\n\ | |
11053 follow uncased characters and lowercase characters only cased ones.\n\ | |
11054 Return False otherwise."); | |
11055 | |
11056 static PyObject* | |
11057 unicode_istitle(PyObject *self) | |
11058 { | |
11059 Py_ssize_t i, length; | |
11060 int kind; | |
11061 void *data; | |
11062 int cased, previous_is_cased; | |
11063 | |
11064 if (PyUnicode_READY(self) == -1) | |
11065 return NULL; | |
11066 length = PyUnicode_GET_LENGTH(self); | |
11067 kind = PyUnicode_KIND(self); | |
11068 data = PyUnicode_DATA(self); | |
11069 | |
11070 /* Shortcut for single character strings */ | |
11071 if (length == 1) { | |
11072 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); | |
11073 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || | |
11074 (Py_UNICODE_ISUPPER(ch) != 0)); | |
11075 } | |
11076 | |
11077 /* Special case for empty strings */ | |
11078 if (length == 0) | |
11079 return PyBool_FromLong(0); | |
11080 | |
11081 cased = 0; | |
11082 previous_is_cased = 0; | |
11083 for (i = 0; i < length; i++) { | |
11084 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
11085 | |
11086 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { | |
11087 if (previous_is_cased) | |
11088 return PyBool_FromLong(0); | |
11089 previous_is_cased = 1; | |
11090 cased = 1; | |
11091 } | |
11092 else if (Py_UNICODE_ISLOWER(ch)) { | |
11093 if (!previous_is_cased) | |
11094 return PyBool_FromLong(0); | |
11095 previous_is_cased = 1; | |
11096 cased = 1; | |
11097 } | |
11098 else | |
11099 previous_is_cased = 0; | |
11100 } | |
11101 return PyBool_FromLong(cased); | |
11102 } | |
11103 | |
11104 PyDoc_STRVAR(isspace__doc__, | |
11105 "S.isspace() -> bool\n\ | |
11106 \n\ | |
11107 Return True if all characters in S are whitespace\n\ | |
11108 and there is at least one character in S, False otherwise."); | |
11109 | |
11110 static PyObject* | |
11111 unicode_isspace(PyObject *self) | |
11112 { | |
11113 Py_ssize_t i, length; | |
11114 int kind; | |
11115 void *data; | |
11116 | |
11117 if (PyUnicode_READY(self) == -1) | |
11118 return NULL; | |
11119 length = PyUnicode_GET_LENGTH(self); | |
11120 kind = PyUnicode_KIND(self); | |
11121 data = PyUnicode_DATA(self); | |
11122 | |
11123 /* Shortcut for single character strings */ | |
11124 if (length == 1) | |
11125 return PyBool_FromLong( | |
11126 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); | |
11127 | |
11128 /* Special case for empty strings */ | |
11129 if (length == 0) | |
11130 return PyBool_FromLong(0); | |
11131 | |
11132 for (i = 0; i < length; i++) { | |
11133 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
11134 if (!Py_UNICODE_ISSPACE(ch)) | |
11135 return PyBool_FromLong(0); | |
11136 } | |
11137 return PyBool_FromLong(1); | |
11138 } | |
11139 | |
11140 PyDoc_STRVAR(isalpha__doc__, | |
11141 "S.isalpha() -> bool\n\ | |
11142 \n\ | |
11143 Return True if all characters in S are alphabetic\n\ | |
11144 and there is at least one character in S, False otherwise."); | |
11145 | |
11146 static PyObject* | |
11147 unicode_isalpha(PyObject *self) | |
11148 { | |
11149 Py_ssize_t i, length; | |
11150 int kind; | |
11151 void *data; | |
11152 | |
11153 if (PyUnicode_READY(self) == -1) | |
11154 return NULL; | |
11155 length = PyUnicode_GET_LENGTH(self); | |
11156 kind = PyUnicode_KIND(self); | |
11157 data = PyUnicode_DATA(self); | |
11158 | |
11159 /* Shortcut for single character strings */ | |
11160 if (length == 1) | |
11161 return PyBool_FromLong( | |
11162 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); | |
11163 | |
11164 /* Special case for empty strings */ | |
11165 if (length == 0) | |
11166 return PyBool_FromLong(0); | |
11167 | |
11168 for (i = 0; i < length; i++) { | |
11169 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) | |
11170 return PyBool_FromLong(0); | |
11171 } | |
11172 return PyBool_FromLong(1); | |
11173 } | |
11174 | |
11175 PyDoc_STRVAR(isalnum__doc__, | |
11176 "S.isalnum() -> bool\n\ | |
11177 \n\ | |
11178 Return True if all characters in S are alphanumeric\n\ | |
11179 and there is at least one character in S, False otherwise."); | |
11180 | |
11181 static PyObject* | |
11182 unicode_isalnum(PyObject *self) | |
11183 { | |
11184 int kind; | |
11185 void *data; | |
11186 Py_ssize_t len, i; | |
11187 | |
11188 if (PyUnicode_READY(self) == -1) | |
11189 return NULL; | |
11190 | |
11191 kind = PyUnicode_KIND(self); | |
11192 data = PyUnicode_DATA(self); | |
11193 len = PyUnicode_GET_LENGTH(self); | |
11194 | |
11195 /* Shortcut for single character strings */ | |
11196 if (len == 1) { | |
11197 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); | |
11198 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); | |
11199 } | |
11200 | |
11201 /* Special case for empty strings */ | |
11202 if (len == 0) | |
11203 return PyBool_FromLong(0); | |
11204 | |
11205 for (i = 0; i < len; i++) { | |
11206 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); | |
11207 if (!Py_UNICODE_ISALNUM(ch)) | |
11208 return PyBool_FromLong(0); | |
11209 } | |
11210 return PyBool_FromLong(1); | |
11211 } | |
11212 | |
11213 PyDoc_STRVAR(isdecimal__doc__, | |
11214 "S.isdecimal() -> bool\n\ | |
11215 \n\ | |
11216 Return True if there are only decimal characters in S,\n\ | |
11217 False otherwise."); | |
11218 | |
11219 static PyObject* | |
11220 unicode_isdecimal(PyObject *self) | |
11221 { | |
11222 Py_ssize_t i, length; | |
11223 int kind; | |
11224 void *data; | |
11225 | |
11226 if (PyUnicode_READY(self) == -1) | |
11227 return NULL; | |
11228 length = PyUnicode_GET_LENGTH(self); | |
11229 kind = PyUnicode_KIND(self); | |
11230 data = PyUnicode_DATA(self); | |
11231 | |
11232 /* Shortcut for single character strings */ | |
11233 if (length == 1) | |
11234 return PyBool_FromLong( | |
11235 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); | |
11236 | |
11237 /* Special case for empty strings */ | |
11238 if (length == 0) | |
11239 return PyBool_FromLong(0); | |
11240 | |
11241 for (i = 0; i < length; i++) { | |
11242 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) | |
11243 return PyBool_FromLong(0); | |
11244 } | |
11245 return PyBool_FromLong(1); | |
11246 } | |
11247 | |
11248 PyDoc_STRVAR(isdigit__doc__, | |
11249 "S.isdigit() -> bool\n\ | |
11250 \n\ | |
11251 Return True if all characters in S are digits\n\ | |
11252 and there is at least one character in S, False otherwise."); | |
11253 | |
11254 static PyObject* | |
11255 unicode_isdigit(PyObject *self) | |
11256 { | |
11257 Py_ssize_t i, length; | |
11258 int kind; | |
11259 void *data; | |
11260 | |
11261 if (PyUnicode_READY(self) == -1) | |
11262 return NULL; | |
11263 length = PyUnicode_GET_LENGTH(self); | |
11264 kind = PyUnicode_KIND(self); | |
11265 data = PyUnicode_DATA(self); | |
11266 | |
11267 /* Shortcut for single character strings */ | |
11268 if (length == 1) { | |
11269 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); | |
11270 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); | |
11271 } | |
11272 | |
11273 /* Special case for empty strings */ | |
11274 if (length == 0) | |
11275 return PyBool_FromLong(0); | |
11276 | |
11277 for (i = 0; i < length; i++) { | |
11278 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) | |
11279 return PyBool_FromLong(0); | |
11280 } | |
11281 return PyBool_FromLong(1); | |
11282 } | |
11283 | |
11284 PyDoc_STRVAR(isnumeric__doc__, | |
11285 "S.isnumeric() -> bool\n\ | |
11286 \n\ | |
11287 Return True if there are only numeric characters in S,\n\ | |
11288 False otherwise."); | |
11289 | |
11290 static PyObject* | |
11291 unicode_isnumeric(PyObject *self) | |
11292 { | |
11293 Py_ssize_t i, length; | |
11294 int kind; | |
11295 void *data; | |
11296 | |
11297 if (PyUnicode_READY(self) == -1) | |
11298 return NULL; | |
11299 length = PyUnicode_GET_LENGTH(self); | |
11300 kind = PyUnicode_KIND(self); | |
11301 data = PyUnicode_DATA(self); | |
11302 | |
11303 /* Shortcut for single character strings */ | |
11304 if (length == 1) | |
11305 return PyBool_FromLong( | |
11306 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); | |
11307 | |
11308 /* Special case for empty strings */ | |
11309 if (length == 0) | |
11310 return PyBool_FromLong(0); | |
11311 | |
11312 for (i = 0; i < length; i++) { | |
11313 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) | |
11314 return PyBool_FromLong(0); | |
11315 } | |
11316 return PyBool_FromLong(1); | |
11317 } | |
11318 | |
11319 int | |
11320 PyUnicode_IsIdentifier(PyObject *self) | |
11321 { | |
11322 int kind; | |
11323 void *data; | |
11324 Py_ssize_t i; | |
11325 Py_UCS4 first; | |
11326 | |
11327 if (PyUnicode_READY(self) == -1) { | |
11328 Py_FatalError("identifier not ready"); | |
11329 return 0; | |
11330 } | |
11331 | |
11332 /* Special case for empty strings */ | |
11333 if (PyUnicode_GET_LENGTH(self) == 0) | |
11334 return 0; | |
11335 kind = PyUnicode_KIND(self); | |
11336 data = PyUnicode_DATA(self); | |
11337 | |
11338 /* PEP 3131 says that the first character must be in | |
11339 XID_Start and subsequent characters in XID_Continue, | |
11340 and for the ASCII range, the 2.x rules apply (i.e | |
11341 start with letters and underscore, continue with | |
11342 letters, digits, underscore). However, given the current | |
11343 definition of XID_Start and XID_Continue, it is sufficient | |
11344 to check just for these, except that _ must be allowed | |
11345 as starting an identifier. */ | |
11346 first = PyUnicode_READ(kind, data, 0); | |
11347 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) | |
11348 return 0; | |
11349 | |
11350 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) | |
11351 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) | |
11352 return 0; | |
11353 return 1; | |
11354 } | |
11355 | |
11356 PyDoc_STRVAR(isidentifier__doc__, | |
11357 "S.isidentifier() -> bool\n\ | |
11358 \n\ | |
11359 Return True if S is a valid identifier according\n\ | |
11360 to the language definition."); | |
11361 | |
11362 static PyObject* | |
11363 unicode_isidentifier(PyObject *self) | |
11364 { | |
11365 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); | |
11366 } | |
11367 | |
11368 PyDoc_STRVAR(isprintable__doc__, | |
11369 "S.isprintable() -> bool\n\ | |
11370 \n\ | |
11371 Return True if all characters in S are considered\n\ | |
11372 printable in repr() or S is empty, False otherwise."); | |
11373 | |
11374 static PyObject* | |
11375 unicode_isprintable(PyObject *self) | |
11376 { | |
11377 Py_ssize_t i, length; | |
11378 int kind; | |
11379 void *data; | |
11380 | |
11381 if (PyUnicode_READY(self) == -1) | |
11382 return NULL; | |
11383 length = PyUnicode_GET_LENGTH(self); | |
11384 kind = PyUnicode_KIND(self); | |
11385 data = PyUnicode_DATA(self); | |
11386 | |
11387 /* Shortcut for single character strings */ | |
11388 if (length == 1) | |
11389 return PyBool_FromLong( | |
11390 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); | |
11391 | |
11392 for (i = 0; i < length; i++) { | |
11393 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { | |
11394 Py_RETURN_FALSE; | |
11395 } | |
11396 } | |
11397 Py_RETURN_TRUE; | |
11398 } | |
11399 | |
11400 PyDoc_STRVAR(join__doc__, | |
11401 "S.join(iterable) -> str\n\ | |
11402 \n\ | |
11403 Return a string which is the concatenation of the strings in the\n\ | |
11404 iterable. The separator between elements is S."); | |
11405 | |
11406 static PyObject* | |
11407 unicode_join(PyObject *self, PyObject *data) | |
11408 { | |
11409 return PyUnicode_Join(self, data); | |
11410 } | |
11411 | |
11412 static Py_ssize_t | |
11413 unicode_length(PyObject *self) | |
11414 { | |
11415 if (PyUnicode_READY(self) == -1) | |
11416 return -1; | |
11417 return PyUnicode_GET_LENGTH(self); | |
11418 } | |
11419 | |
11420 PyDoc_STRVAR(ljust__doc__, | |
11421 "S.ljust(width[, fillchar]) -> str\n\ | |
11422 \n\ | |
11423 Return S left-justified in a Unicode string of length width. Padding is\n\ | |
11424 done using the specified fill character (default is a space)."); | |
11425 | |
11426 static PyObject * | |
11427 unicode_ljust(PyObject *self, PyObject *args) | |
11428 { | |
11429 Py_ssize_t width; | |
11430 Py_UCS4 fillchar = ' '; | |
11431 | |
11432 if (PyUnicode_READY(self) == -1) | |
11433 return NULL; | |
11434 | |
11435 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) | |
11436 return NULL; | |
11437 | |
11438 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { | |
11439 Py_INCREF(self); | |
11440 return self; | |
11441 } | |
11442 | |
11443 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar); | |
11444 } | |
11445 | |
11446 PyDoc_STRVAR(lower__doc__, | |
11447 "S.lower() -> str\n\ | |
11448 \n\ | |
11449 Return a copy of the string S converted to lowercase."); | |
11450 | |
11451 static PyObject* | |
11452 unicode_lower(PyObject *self) | |
11453 { | |
11454 return fixup(self, fixlower); | |
11455 } | |
11456 | |
/* Strip modes shared by the strip helpers in this file; the values
   double as indexes into stripformat[] below. */
11457 #define LEFTSTRIP 0 | |
11458 #define RIGHTSTRIP 1 | |
11459 #define BOTHSTRIP 2 | |
11460 | |
11461 /* PyArg_ParseTuple() format strings, indexed by the strip mode above */ | |
11462 static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; | |
11463 | |
/* Method name for strip mode i: skips the three-character "|O:" prefix
   of the corresponding format string. */
11464 #define STRIPNAME(i) (stripformat[i]+3) | |
11465 | |
11466 /* externally visible for str.strip(unicode) */ | |
11467 PyObject * | |
11468 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) | |
11469 { | |
11470 void *data; | |
11471 int kind; | |
11472 Py_ssize_t i, j, len; | |
11473 BLOOM_MASK sepmask; | |
11474 | |
11475 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) | |
11476 return NULL; | |
11477 | |
11478 kind = PyUnicode_KIND(self); | |
11479 data = PyUnicode_DATA(self); | |
11480 len = PyUnicode_GET_LENGTH(self); | |
11481 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), | |
11482 PyUnicode_DATA(sepobj), | |
11483 PyUnicode_GET_LENGTH(sepobj)); | |
11484 | |
11485 i = 0; | |
11486 if (striptype != RIGHTSTRIP) { | |
11487 while (i < len && | |
11488 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { | |
11489 i++; | |
11490 } | |
11491 } | |
11492 | |
11493 j = len; | |
11494 if (striptype != LEFTSTRIP) { | |
11495 do { | |
11496 j--; | |
11497 } while (j >= i && | |
11498 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); | |
11499 j++; | |
11500 } | |
11501 | |
11502 return PyUnicode_Substring(self, i, j); | |
11503 } | |
11504 | |
11505 PyObject* | |
11506 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) | |
11507 { | |
11508 unsigned char *data; | |
11509 int kind; | |
11510 Py_ssize_t length; | |
11511 | |
11512 if (PyUnicode_READY(self) == -1) | |
11513 return NULL; | |
11514 | |
11515 end = Py_MIN(end, PyUnicode_GET_LENGTH(self)); | |
11516 | |
11517 if (start == 0 && end == PyUnicode_GET_LENGTH(self)) | |
11518 { | |
11519 if (PyUnicode_CheckExact(self)) { | |
11520 Py_INCREF(self); | |
11521 return self; | |
11522 } | |
11523 else | |
11524 return PyUnicode_Copy(self); | |
11525 } | |
11526 | |
11527 length = end - start; | |
11528 if (length == 1) | |
11529 return unicode_getitem(self, start); | |
11530 | |
11531 if (start < 0 || end < 0) { | |
11532 PyErr_SetString(PyExc_IndexError, "string index out of range"); | |
11533 return NULL; | |
11534 } | |
11535 | |
11536 if (PyUnicode_IS_ASCII(self)) { | |
11537 kind = PyUnicode_KIND(self); | |
11538 data = PyUnicode_1BYTE_DATA(self); | |
11539 return unicode_fromascii(data + start, length); | |
11540 } | |
11541 else { | |
11542 kind = PyUnicode_KIND(self); | |
11543 data = PyUnicode_1BYTE_DATA(self); | |
11544 return PyUnicode_FromKindAndData(kind, | |
11545 data + kind * start, | |
11546 length); | |
11547 } | |
11548 } | |
11549 | |
11550 static PyObject * | |
11551 do_strip(PyObject *self, int striptype) | |
11552 { | |
11553 int kind; | |
11554 void *data; | |
11555 Py_ssize_t len, i, j; | |
11556 | |
11557 if (PyUnicode_READY(self) == -1) | |
11558 return NULL; | |
11559 | |
11560 kind = PyUnicode_KIND(self); | |
11561 data = PyUnicode_DATA(self); | |
11562 len = PyUnicode_GET_LENGTH(self); | |
11563 | |
11564 i = 0; | |
11565 if (striptype != RIGHTSTRIP) { | |
11566 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { | |
11567 i++; | |
11568 } | |
11569 } | |
11570 | |
11571 j = len; | |
11572 if (striptype != LEFTSTRIP) { | |
11573 do { | |
11574 j--; | |
11575 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); | |
11576 j++; | |
11577 } | |
11578 | |
11579 return PyUnicode_Substring(self, i, j); | |
11580 } | |
11581 | |
11582 | |
11583 static PyObject * | |
11584 do_argstrip(PyObject *self, int striptype, PyObject *args) | |
11585 { | |
11586 PyObject *sep = NULL; | |
11587 | |
11588 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) | |
11589 return NULL; | |
11590 | |
11591 if (sep != NULL && sep != Py_None) { | |
11592 if (PyUnicode_Check(sep)) | |
11593 return _PyUnicode_XStrip(self, striptype, sep); | |
11594 else { | |
11595 PyErr_Format(PyExc_TypeError, | |
11596 "%s arg must be None or str", | |
11597 STRIPNAME(striptype)); | |
11598 return NULL; | |
11599 } | |
11600 } | |
11601 | |
11602 return do_strip(self, striptype); | |
11603 } | |
11604 | |
11605 | |
11606 PyDoc_STRVAR(strip__doc__, | |
11607 "S.strip([chars]) -> str\n\ | |
11608 \n\ | |
11609 Return a copy of the string S with leading and trailing\n\ | |
11610 whitespace removed.\n\ | |
11611 If chars is given and not None, remove characters in chars instead."); | |
11612 | |
11613 static PyObject * | |
11614 unicode_strip(PyObject *self, PyObject *args) | |
11615 { | |
11616 if (PyTuple_GET_SIZE(args) == 0) | |
11617 return do_strip(self, BOTHSTRIP); /* Common case */ | |
11618 else | |
11619 return do_argstrip(self, BOTHSTRIP, args); | |
11620 } | |
11621 | |
11622 | |
11623 PyDoc_STRVAR(lstrip__doc__, | |
11624 "S.lstrip([chars]) -> str\n\ | |
11625 \n\ | |
11626 Return a copy of the string S with leading whitespace removed.\n\ | |
11627 If chars is given and not None, remove characters in chars instead."); | |
11628 | |
11629 static PyObject * | |
11630 unicode_lstrip(PyObject *self, PyObject *args) | |
11631 { | |
11632 if (PyTuple_GET_SIZE(args) == 0) | |
11633 return do_strip(self, LEFTSTRIP); /* Common case */ | |
11634 else | |
11635 return do_argstrip(self, LEFTSTRIP, args); | |
11636 } | |
11637 | |
11638 | |
11639 PyDoc_STRVAR(rstrip__doc__, | |
11640 "S.rstrip([chars]) -> str\n\ | |
11641 \n\ | |
11642 Return a copy of the string S with trailing whitespace removed.\n\ | |
11643 If chars is given and not None, remove characters in chars instead."); | |
11644 | |
11645 static PyObject * | |
11646 unicode_rstrip(PyObject *self, PyObject *args) | |
11647 { | |
11648 if (PyTuple_GET_SIZE(args) == 0) | |
11649 return do_strip(self, RIGHTSTRIP); /* Common case */ | |
11650 else | |
11651 return do_argstrip(self, RIGHTSTRIP, args); | |
11652 } | |
11653 | |
11654 | |
11655 static PyObject* | |
11656 unicode_repeat(PyObject *str, Py_ssize_t len) | |
11657 { | |
11658 PyObject *u; | |
11659 Py_ssize_t nchars, n; | |
11660 | |
11661 if (len < 1) { | |
11662 Py_INCREF(unicode_empty); | |
11663 return unicode_empty; | |
11664 } | |
11665 | |
11666 if (len == 1 && PyUnicode_CheckExact(str)) { | |
11667 /* no repeat, return original string */ | |
11668 Py_INCREF(str); | |
11669 return str; | |
11670 } | |
11671 | |
11672 if (PyUnicode_READY(str) == -1) | |
11673 return NULL; | |
11674 | |
11675 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { | |
11676 PyErr_SetString(PyExc_OverflowError, | |
11677 "repeated string is too long"); | |
11678 return NULL; | |
11679 } | |
11680 nchars = len * PyUnicode_GET_LENGTH(str); | |
11681 | |
11682 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); | |
11683 if (!u) | |
11684 return NULL; | |
11685 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); | |
11686 | |
11687 if (PyUnicode_GET_LENGTH(str) == 1) { | |
11688 const int kind = PyUnicode_KIND(str); | |
11689 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); | |
11690 void *to = PyUnicode_DATA(u); | |
11691 if (kind == PyUnicode_1BYTE_KIND) | |
11692 memset(to, (unsigned char)fill_char, len); | |
11693 else { | |
11694 for (n = 0; n < len; ++n) | |
11695 PyUnicode_WRITE(kind, to, n, fill_char); | |
11696 } | |
11697 } | |
11698 else { | |
11699 /* number of characters copied this far */ | |
11700 Py_ssize_t done = PyUnicode_GET_LENGTH(str); | |
11701 const Py_ssize_t char_size = PyUnicode_KIND(str); | |
11702 char *to = (char *) PyUnicode_DATA(u); | |
11703 Py_MEMCPY(to, PyUnicode_DATA(str), | |
11704 PyUnicode_GET_LENGTH(str) * char_size); | |
11705 while (done < nchars) { | |
11706 n = (done <= nchars-done) ? done : nchars-done; | |
11707 Py_MEMCPY(to + (done * char_size), to, n * char_size); | |
11708 done += n; | |
11709 } | |
11710 } | |
11711 | |
11712 assert(_PyUnicode_CheckConsistency(u, 1)); | |
11713 return u; | |
11714 } | |
11715 | |
11716 PyObject * | |
11717 PyUnicode_Replace(PyObject *obj, | |
11718 PyObject *subobj, | |
11719 PyObject *replobj, | |
11720 Py_ssize_t maxcount) | |
11721 { | |
11722 PyObject *self; | |
11723 PyObject *str1; | |
11724 PyObject *str2; | |
11725 PyObject *result; | |
11726 | |
11727 self = PyUnicode_FromObject(obj); | |
11728 if (self == NULL || PyUnicode_READY(self) == -1) | |
11729 return NULL; | |
11730 str1 = PyUnicode_FromObject(subobj); | |
11731 if (str1 == NULL || PyUnicode_READY(str1) == -1) { | |
11732 Py_DECREF(self); | |
11733 return NULL; | |
11734 } | |
11735 str2 = PyUnicode_FromObject(replobj); | |
11736 if (str2 == NULL || PyUnicode_READY(str2)) { | |
11737 Py_DECREF(self); | |
11738 Py_DECREF(str1); | |
11739 return NULL; | |
11740 } | |
11741 result = replace(self, str1, str2, maxcount); | |
11742 Py_DECREF(self); | |
11743 Py_DECREF(str1); | |
11744 Py_DECREF(str2); | |
11745 return result; | |
11746 } | |
11747 | |
11748 PyDoc_STRVAR(replace__doc__, | |
11749 "S.replace(old, new[, count]) -> str\n\ | |
11750 \n\ | |
11751 Return a copy of S with all occurrences of substring\n\ | |
11752 old replaced by new. If the optional argument count is\n\ | |
11753 given, only the first count occurrences are replaced."); | |
11754 | |
11755 static PyObject* | |
11756 unicode_replace(PyObject *self, PyObject *args) | |
11757 { | |
11758 PyObject *str1; | |
11759 PyObject *str2; | |
11760 Py_ssize_t maxcount = -1; | |
11761 PyObject *result; | |
11762 | |
11763 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) | |
11764 return NULL; | |
11765 if (!PyUnicode_READY(self) == -1) | |
11766 return NULL; | |
11767 str1 = PyUnicode_FromObject(str1); | |
11768 if (str1 == NULL || PyUnicode_READY(str1) == -1) | |
11769 return NULL; | |
11770 str2 = PyUnicode_FromObject(str2); | |
11771 if (str2 == NULL || PyUnicode_READY(str2) == -1) { | |
11772 Py_DECREF(str1); | |
11773 return NULL; | |
11774 } | |
11775 | |
11776 result = replace(self, str1, str2, maxcount); | |
11777 | |
11778 Py_DECREF(str1); | |
11779 Py_DECREF(str2); | |
11780 return result; | |
11781 } | |
11782 | |
/* Build the repr() of a unicode string as a new string object.

   Works in two passes over the input.  The first pass computes the
   exact output length (osize), counts single and double quotes to pick
   the quoting character, and tracks the largest character that will be
   copied unescaped (max).  The second pass writes the quoted, escaped
   text into a string allocated with exactly that size.  Both passes
   must classify every character identically, otherwise the output
   would not fill the preallocated buffer exactly.

   Returns NULL if the input cannot be made ready or the allocation
   fails. */
11783 static PyObject * | |
11784 unicode_repr(PyObject *unicode) | |
11785 { | |
11786 PyObject *repr; | |
11787 Py_ssize_t isize; | |
11788 Py_ssize_t osize, squote, dquote, i, o; | |
11789 Py_UCS4 max, quote; | |
11790 int ikind, okind; | |
11791 void *idata, *odata; | |
11792 | |
11793 if (PyUnicode_READY(unicode) == -1) | |
11794 return NULL; | |
11795 | |
11796 isize = PyUnicode_GET_LENGTH(unicode); | |
11797 idata = PyUnicode_DATA(unicode); | |
11798 | |
11799 /* Compute length of output, quote characters, and | |
11800 maximum character */ | |
11801 osize = 2; /* quotes */ | |
11802 max = 127; | |
11803 squote = dquote = 0; | |
11804 ikind = PyUnicode_KIND(unicode); | |
11805 for (i = 0; i < isize; i++) { | |
11806 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); | |
11807 switch (ch) { | |
/* Quotes are counted as one output character here; the extra backslash
   for escaped single quotes is added after the loop, once the quoting
   character is known. */
11808 case '\'': squote++; osize++; break; | |
11809 case '"': dquote++; osize++; break; | |
11810 case '\\': case '\t': case '\r': case '\n': | |
11811 osize += 2; break; | |
11812 default: | |
11813 /* Fast-path ASCII */ | |
11814 if (ch < ' ' || ch == 0x7f) | |
11815 osize += 4; /* \xHH */ | |
11816 else if (ch < 0x7f) | |
11817 osize++; | |
11818 else if (Py_UNICODE_ISPRINTABLE(ch)) { | |
11819 osize++; | |
11820 max = ch > max ? ch : max; | |
11821 } | |
11822 else if (ch < 0x100) | |
11823 osize += 4; /* \xHH */ | |
11824 else if (ch < 0x10000) | |
11825 osize += 6; /* \uHHHH */ | |
11826 else | |
11827 osize += 10; /* \UHHHHHHHH */ | |
11828 } | |
11829 } | |
11830 | |
/* Choose the quoting character: single quotes by default, double quotes
   when the text contains single quotes but no double quotes. */
11831 quote = '\''; | |
11832 if (squote) { | |
11833 if (dquote) | |
11834 /* Both squote and dquote present. Use squote, | |
11835 and escape them */ | |
11836 osize += squote; | |
11837 else | |
11838 quote = '"'; | |
11839 } | |
11840 | |
11841 repr = PyUnicode_New(osize, max); | |
11842 if (repr == NULL) | |
11843 return NULL; | |
11844 okind = PyUnicode_KIND(repr); | |
11845 odata = PyUnicode_DATA(repr); | |
11846 | |
/* Both enclosing quotes are written up front; the loop below fills the
   positions in between, starting at output index 1. */
11847 PyUnicode_WRITE(okind, odata, 0, quote); | |
11848 PyUnicode_WRITE(okind, odata, osize-1, quote); | |
11849 | |
11850 for (i = 0, o = 1; i < isize; i++) { | |
11851 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); | |
11852 | |
11853 /* Escape quotes and backslashes */ | |
11854 if ((ch == quote) || (ch == '\\')) { | |
11855 PyUnicode_WRITE(okind, odata, o++, '\\'); | |
11856 PyUnicode_WRITE(okind, odata, o++, ch); | |
11857 continue; | |
11858 } | |
11859 | |
11860 /* Map special whitespace to '\t', '\n', '\r' */ | |
11861 if (ch == '\t') { | |
11862 PyUnicode_WRITE(okind, odata, o++, '\\'); | |
11863 PyUnicode_WRITE(okind, odata, o++, 't'); | |
11864 } | |
11865 else if (ch == '\n') { | |
11866 PyUnicode_WRITE(okind, odata, o++, '\\'); | |
11867 PyUnicode_WRITE(okind, odata, o++, 'n'); | |
11868 } | |
11869 else if (ch == '\r') { | |
11870 PyUnicode_WRITE(okind, odata, o++, '\\'); | |
11871 PyUnicode_WRITE(okind, odata, o++, 'r'); | |
11872 } | |
11873 | |
11874 /* Map non-printable US ASCII to '\xhh' */ | |
11875 else if (ch < ' ' || ch == 0x7F) { | |
11876 PyUnicode_WRITE(okind, odata, o++, '\\'); | |
11877 PyUnicode_WRITE(okind, odata, o++, 'x'); | |
11878 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); | |
11879 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); | |
11880 } | |
11881 | |
11882 /* Copy ASCII characters as-is */ | |
11883 else if (ch < 0x7F) { | |
11884 PyUnicode_WRITE(okind, odata, o++, ch); | |
11885 } | |
11886 | |
11887 /* Non-ASCII characters */ | |
11888 else { | |
11889 /* Map Unicode whitespace and control characters | |
11890 (categories Z* and C* except ASCII space) | |
11891 */ | |
11892 if (!Py_UNICODE_ISPRINTABLE(ch)) { | |
11893 /* Map 8-bit characters to '\xhh' */ | |
11894 if (ch <= 0xff) { | |
11895 PyUnicode_WRITE(okind, odata, o++, '\\'); | |
11896 PyUnicode_WRITE(okind, odata, o++, 'x'); | |
11897 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); | |
11898 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); | |
11899 } | |
11900 /* Map 21-bit characters to '\U00xxxxxx' */ | |
11901 else if (ch >= 0x10000) { | |
11902 PyUnicode_WRITE(okind, odata, o++, '\\'); | |
11903 PyUnicode_WRITE(okind, odata, o++, 'U'); | |
11904 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); | |
11905 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); | |
11906 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); | |
11907 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); | |
11908 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); | |
11909 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); | |
11910 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); | |
11911 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); | |
11912 } | |
11913 /* Map 16-bit characters to '\uxxxx' */ | |
11914 else { | |
11915 PyUnicode_WRITE(okind, odata, o++, '\\'); | |
11916 PyUnicode_WRITE(okind, odata, o++, 'u'); | |
11917 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); | |
11918 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); | |
11919 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); | |
11920 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); | |
11921 } | |
11922 } | |
11923 /* Copy characters as-is */ | |
11924 else { | |
11925 PyUnicode_WRITE(okind, odata, o++, ch); | |
11926 } | |
11927 } | |
11928 } | |
11929 /* Closing quote already added at the beginning */ | |
11930 assert(_PyUnicode_CheckConsistency(repr, 1)); | |
11931 return repr; | |
11932 } | |
11933 | |
11934 PyDoc_STRVAR(rfind__doc__, | |
11935 "S.rfind(sub[, start[, end]]) -> int\n\ | |
11936 \n\ | |
11937 Return the highest index in S where substring sub is found,\n\ | |
11938 such that sub is contained within S[start:end]. Optional\n\ | |
11939 arguments start and end are interpreted as in slice notation.\n\ | |
11940 \n\ | |
11941 Return -1 on failure."); | |
11942 | |
11943 static PyObject * | |
11944 unicode_rfind(PyObject *self, PyObject *args) | |
11945 { | |
11946 PyObject *substring; | |
11947 Py_ssize_t start; | |
11948 Py_ssize_t end; | |
11949 Py_ssize_t result; | |
11950 | |
11951 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, | |
11952 &start, &end)) | |
11953 return NULL; | |
11954 | |
11955 if (PyUnicode_READY(self) == -1) | |
11956 return NULL; | |
11957 if (PyUnicode_READY(substring) == -1) | |
11958 return NULL; | |
11959 | |
11960 result = any_find_slice(-1, self, substring, start, end); | |
11961 | |
11962 Py_DECREF(substring); | |
11963 | |
11964 if (result == -2) | |
11965 return NULL; | |
11966 | |
11967 return PyLong_FromSsize_t(result); | |
11968 } | |
11969 | |
11970 PyDoc_STRVAR(rindex__doc__, | |
11971 "S.rindex(sub[, start[, end]]) -> int\n\ | |
11972 \n\ | |
11973 Like S.rfind() but raise ValueError when the substring is not found."); | |
11974 | |
11975 static PyObject * | |
11976 unicode_rindex(PyObject *self, PyObject *args) | |
11977 { | |
11978 PyObject *substring; | |
11979 Py_ssize_t start; | |
11980 Py_ssize_t end; | |
11981 Py_ssize_t result; | |
11982 | |
11983 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, | |
11984 &start, &end)) | |
11985 return NULL; | |
11986 | |
11987 if (PyUnicode_READY(self) == -1) | |
11988 return NULL; | |
11989 if (PyUnicode_READY(substring) == -1) | |
11990 return NULL; | |
11991 | |
11992 result = any_find_slice(-1, self, substring, start, end); | |
11993 | |
11994 Py_DECREF(substring); | |
11995 | |
11996 if (result == -2) | |
11997 return NULL; | |
11998 | |
11999 if (result < 0) { | |
12000 PyErr_SetString(PyExc_ValueError, "substring not found"); | |
12001 return NULL; | |
12002 } | |
12003 | |
12004 return PyLong_FromSsize_t(result); | |
12005 } | |
12006 | |
12007 PyDoc_STRVAR(rjust__doc__, | |
12008 "S.rjust(width[, fillchar]) -> str\n\ | |
12009 \n\ | |
12010 Return S right-justified in a string of length width. Padding is\n\ | |
12011 done using the specified fill character (default is a space)."); | |
12012 | |
12013 static PyObject * | |
12014 unicode_rjust(PyObject *self, PyObject *args) | |
12015 { | |
12016 Py_ssize_t width; | |
12017 Py_UCS4 fillchar = ' '; | |
12018 | |
12019 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) | |
12020 return NULL; | |
12021 | |
12022 if (PyUnicode_READY(self) == -1) | |
12023 return NULL; | |
12024 | |
12025 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { | |
12026 Py_INCREF(self); | |
12027 return self; | |
12028 } | |
12029 | |
12030 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar); | |
12031 } | |
12032 | |
12033 PyObject * | |
12034 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) | |
12035 { | |
12036 PyObject *result; | |
12037 | |
12038 s = PyUnicode_FromObject(s); | |
12039 if (s == NULL) | |
12040 return NULL; | |
12041 if (sep != NULL) { | |
12042 sep = PyUnicode_FromObject(sep); | |
12043 if (sep == NULL) { | |
12044 Py_DECREF(s); | |
12045 return NULL; | |
12046 } | |
12047 } | |
12048 | |
12049 result = split(s, sep, maxsplit); | |
12050 | |
12051 Py_DECREF(s); | |
12052 Py_XDECREF(sep); | |
12053 return result; | |
12054 } | |
12055 | |
12056 PyDoc_STRVAR(split__doc__, | |
12057 "S.split([sep[, maxsplit]]) -> list of strings\n\ | |
12058 \n\ | |
12059 Return a list of the words in S, using sep as the\n\ | |
12060 delimiter string. If maxsplit is given, at most maxsplit\n\ | |
12061 splits are done. If sep is not specified or is None, any\n\ | |
12062 whitespace string is a separator and empty strings are\n\ | |
12063 removed from the result."); | |
12064 | |
12065 static PyObject* | |
12066 unicode_split(PyObject *self, PyObject *args) | |
12067 { | |
12068 PyObject *substring = Py_None; | |
12069 Py_ssize_t maxcount = -1; | |
12070 | |
12071 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) | |
12072 return NULL; | |
12073 | |
12074 if (substring == Py_None) | |
12075 return split(self, NULL, maxcount); | |
12076 else if (PyUnicode_Check(substring)) | |
12077 return split(self, substring, maxcount); | |
12078 else | |
12079 return PyUnicode_Split(self, substring, maxcount); | |
12080 } | |
12081 | |
12082 PyObject * | |
12083 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) | |
12084 { | |
12085 PyObject* str_obj; | |
12086 PyObject* sep_obj; | |
12087 PyObject* out; | |
12088 int kind1, kind2, kind; | |
12089 void *buf1 = NULL, *buf2 = NULL; | |
12090 Py_ssize_t len1, len2; | |
12091 | |
12092 str_obj = PyUnicode_FromObject(str_in); | |
12093 if (!str_obj || PyUnicode_READY(str_obj) == -1) | |
12094 return NULL; | |
12095 sep_obj = PyUnicode_FromObject(sep_in); | |
12096 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) { | |
12097 Py_DECREF(str_obj); | |
12098 return NULL; | |
12099 } | |
12100 | |
12101 kind1 = PyUnicode_KIND(str_obj); | |
12102 kind2 = PyUnicode_KIND(sep_obj); | |
12103 kind = Py_MAX(kind1, kind2); | |
12104 buf1 = PyUnicode_DATA(str_obj); | |
12105 if (kind1 != kind) | |
12106 buf1 = _PyUnicode_AsKind(str_obj, kind); | |
12107 if (!buf1) | |
12108 goto onError; | |
12109 buf2 = PyUnicode_DATA(sep_obj); | |
12110 if (kind2 != kind) | |
12111 buf2 = _PyUnicode_AsKind(sep_obj, kind); | |
12112 if (!buf2) | |
12113 goto onError; | |
12114 len1 = PyUnicode_GET_LENGTH(str_obj); | |
12115 len2 = PyUnicode_GET_LENGTH(sep_obj); | |
12116 | |
12117 switch(PyUnicode_KIND(str_obj)) { | |
12118 case PyUnicode_1BYTE_KIND: | |
12119 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) | |
12120 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); | |
12121 else | |
12122 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); | |
12123 break; | |
12124 case PyUnicode_2BYTE_KIND: | |
12125 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); | |
12126 break; | |
12127 case PyUnicode_4BYTE_KIND: | |
12128 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); | |
12129 break; | |
12130 default: | |
12131 assert(0); | |
12132 out = 0; | |
12133 } | |
12134 | |
12135 Py_DECREF(sep_obj); | |
12136 Py_DECREF(str_obj); | |
12137 if (kind1 != kind) | |
12138 PyMem_Free(buf1); | |
12139 if (kind2 != kind) | |
12140 PyMem_Free(buf2); | |
12141 | |
12142 return out; | |
12143 onError: | |
12144 Py_DECREF(sep_obj); | |
12145 Py_DECREF(str_obj); | |
12146 if (kind1 != kind && buf1) | |
12147 PyMem_Free(buf1); | |
12148 if (kind2 != kind && buf2) | |
12149 PyMem_Free(buf2); | |
12150 return NULL; | |
12151 } | |
12152 | |
12153 | |
12154 PyObject * | |
12155 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) | |
12156 { | |
12157 PyObject* str_obj; | |
12158 PyObject* sep_obj; | |
12159 PyObject* out; | |
12160 int kind1, kind2, kind; | |
12161 void *buf1 = NULL, *buf2 = NULL; | |
12162 Py_ssize_t len1, len2; | |
12163 | |
12164 str_obj = PyUnicode_FromObject(str_in); | |
12165 if (!str_obj) | |
12166 return NULL; | |
12167 sep_obj = PyUnicode_FromObject(sep_in); | |
12168 if (!sep_obj) { | |
12169 Py_DECREF(str_obj); | |
12170 return NULL; | |
12171 } | |
12172 | |
12173 kind1 = PyUnicode_KIND(str_in); | |
12174 kind2 = PyUnicode_KIND(sep_obj); | |
12175 kind = Py_MAX(kind1, kind2); | |
12176 buf1 = PyUnicode_DATA(str_in); | |
12177 if (kind1 != kind) | |
12178 buf1 = _PyUnicode_AsKind(str_in, kind); | |
12179 if (!buf1) | |
12180 goto onError; | |
12181 buf2 = PyUnicode_DATA(sep_obj); | |
12182 if (kind2 != kind) | |
12183 buf2 = _PyUnicode_AsKind(sep_obj, kind); | |
12184 if (!buf2) | |
12185 goto onError; | |
12186 len1 = PyUnicode_GET_LENGTH(str_obj); | |
12187 len2 = PyUnicode_GET_LENGTH(sep_obj); | |
12188 | |
12189 switch(PyUnicode_KIND(str_in)) { | |
12190 case PyUnicode_1BYTE_KIND: | |
12191 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) | |
12192 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); | |
12193 else | |
12194 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); | |
12195 break; | |
12196 case PyUnicode_2BYTE_KIND: | |
12197 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); | |
12198 break; | |
12199 case PyUnicode_4BYTE_KIND: | |
12200 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); | |
12201 break; | |
12202 default: | |
12203 assert(0); | |
12204 out = 0; | |
12205 } | |
12206 | |
12207 Py_DECREF(sep_obj); | |
12208 Py_DECREF(str_obj); | |
12209 if (kind1 != kind) | |
12210 PyMem_Free(buf1); | |
12211 if (kind2 != kind) | |
12212 PyMem_Free(buf2); | |
12213 | |
12214 return out; | |
12215 onError: | |
12216 Py_DECREF(sep_obj); | |
12217 Py_DECREF(str_obj); | |
12218 if (kind1 != kind && buf1) | |
12219 PyMem_Free(buf1); | |
12220 if (kind2 != kind && buf2) | |
12221 PyMem_Free(buf2); | |
12222 return NULL; | |
12223 } | |
12224 | |
12225 PyDoc_STRVAR(partition__doc__, | |
12226 "S.partition(sep) -> (head, sep, tail)\n\ | |
12227 \n\ | |
12228 Search for the separator sep in S, and return the part before it,\n\ | |
12229 the separator itself, and the part after it. If the separator is not\n\ | |
12230 found, return S and two empty strings."); | |
12231 | |
12232 static PyObject* | |
12233 unicode_partition(PyObject *self, PyObject *separator) | |
12234 { | |
12235 return PyUnicode_Partition(self, separator); | |
12236 } | |
12237 | |
12238 PyDoc_STRVAR(rpartition__doc__, | |
12239 "S.rpartition(sep) -> (head, sep, tail)\n\ | |
12240 \n\ | |
12241 Search for the separator sep in S, starting at the end of S, and return\n\ | |
12242 the part before it, the separator itself, and the part after it. If the\n\ | |
12243 separator is not found, return two empty strings and S."); | |
12244 | |
12245 static PyObject* | |
12246 unicode_rpartition(PyObject *self, PyObject *separator) | |
12247 { | |
12248 return PyUnicode_RPartition(self, separator); | |
12249 } | |
12250 | |
12251 PyObject * | |
12252 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) | |
12253 { | |
12254 PyObject *result; | |
12255 | |
12256 s = PyUnicode_FromObject(s); | |
12257 if (s == NULL) | |
12258 return NULL; | |
12259 if (sep != NULL) { | |
12260 sep = PyUnicode_FromObject(sep); | |
12261 if (sep == NULL) { | |
12262 Py_DECREF(s); | |
12263 return NULL; | |
12264 } | |
12265 } | |
12266 | |
12267 result = rsplit(s, sep, maxsplit); | |
12268 | |
12269 Py_DECREF(s); | |
12270 Py_XDECREF(sep); | |
12271 return result; | |
12272 } | |
12273 | |
12274 PyDoc_STRVAR(rsplit__doc__, | |
12275 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ | |
12276 \n\ | |
12277 Return a list of the words in S, using sep as the\n\ | |
12278 delimiter string, starting at the end of the string and\n\ | |
12279 working to the front. If maxsplit is given, at most maxsplit\n\ | |
12280 splits are done. If sep is not specified, any whitespace string\n\ | |
12281 is a separator."); | |
12282 | |
12283 static PyObject* | |
12284 unicode_rsplit(PyObject *self, PyObject *args) | |
12285 { | |
12286 PyObject *substring = Py_None; | |
12287 Py_ssize_t maxcount = -1; | |
12288 | |
12289 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) | |
12290 return NULL; | |
12291 | |
12292 if (substring == Py_None) | |
12293 return rsplit(self, NULL, maxcount); | |
12294 else if (PyUnicode_Check(substring)) | |
12295 return rsplit(self, substring, maxcount); | |
12296 else | |
12297 return PyUnicode_RSplit(self, substring, maxcount); | |
12298 } | |
12299 | |
12300 PyDoc_STRVAR(splitlines__doc__, | |
12301 "S.splitlines([keepends]) -> list of strings\n\ | |
12302 \n\ | |
12303 Return a list of the lines in S, breaking at line boundaries.\n\ | |
12304 Line breaks are not included in the resulting list unless keepends\n\ | |
12305 is given and true."); | |
12306 | |
12307 static PyObject* | |
12308 unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) | |
12309 { | |
12310 static char *kwlist[] = {"keepends", 0}; | |
12311 int keepends = 0; | |
12312 | |
12313 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", | |
12314 kwlist, &keepends)) | |
12315 return NULL; | |
12316 | |
12317 return PyUnicode_Splitlines(self, keepends); | |
12318 } | |
12319 | |
12320 static | |
12321 PyObject *unicode_str(PyObject *self) | |
12322 { | |
12323 if (PyUnicode_CheckExact(self)) { | |
12324 Py_INCREF(self); | |
12325 return self; | |
12326 } else | |
12327 /* Subtype -- return genuine unicode string with the same value. */ | |
12328 return PyUnicode_Copy(self); | |
12329 } | |
12330 | |
12331 PyDoc_STRVAR(swapcase__doc__, | |
12332 "S.swapcase() -> str\n\ | |
12333 \n\ | |
12334 Return a copy of S with uppercase characters converted to lowercase\n\ | |
12335 and vice versa."); | |
12336 | |
12337 static PyObject* | |
12338 unicode_swapcase(PyObject *self) | |
12339 { | |
12340 return fixup(self, fixswapcase); | |
12341 } | |
12342 | |
12343 PyDoc_STRVAR(maketrans__doc__, | |
12344 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ | |
12345 \n\ | |
12346 Return a translation table usable for str.translate().\n\ | |
12347 If there is only one argument, it must be a dictionary mapping Unicode\n\ | |
12348 ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ | |
12349 Character keys will be then converted to ordinals.\n\ | |
12350 If there are two arguments, they must be strings of equal length, and\n\ | |
12351 in the resulting dictionary, each character in x will be mapped to the\n\ | |
12352 character at the same position in y. If there is a third argument, it\n\ | |
12353 must be a string, whose characters will be mapped to None in the result."); | |
12354 | |
12355 static PyObject* | |
12356 unicode_maketrans(PyObject *null, PyObject *args) | |
12357 { | |
12358 PyObject *x, *y = NULL, *z = NULL; | |
12359 PyObject *new = NULL, *key, *value; | |
12360 Py_ssize_t i = 0; | |
12361 int res; | |
12362 | |
12363 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) | |
12364 return NULL; | |
12365 new = PyDict_New(); | |
12366 if (!new) | |
12367 return NULL; | |
12368 if (y != NULL) { | |
12369 int x_kind, y_kind, z_kind; | |
12370 void *x_data, *y_data, *z_data; | |
12371 | |
12372 /* x must be a string too, of equal length */ | |
12373 if (!PyUnicode_Check(x)) { | |
12374 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " | |
12375 "be a string if there is a second argument"); | |
12376 goto err; | |
12377 } | |
12378 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { | |
12379 PyErr_SetString(PyExc_ValueError, "the first two maketrans " | |
12380 "arguments must have equal length"); | |
12381 goto err; | |
12382 } | |
12383 /* create entries for translating chars in x to those in y */ | |
12384 x_kind = PyUnicode_KIND(x); | |
12385 y_kind = PyUnicode_KIND(y); | |
12386 x_data = PyUnicode_DATA(x); | |
12387 y_data = PyUnicode_DATA(y); | |
12388 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { | |
12389 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); | |
12390 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); | |
12391 if (!key || !value) | |
12392 goto err; | |
12393 res = PyDict_SetItem(new, key, value); | |
12394 Py_DECREF(key); | |
12395 Py_DECREF(value); | |
12396 if (res < 0) | |
12397 goto err; | |
12398 } | |
12399 /* create entries for deleting chars in z */ | |
12400 if (z != NULL) { | |
12401 z_kind = PyUnicode_KIND(z); | |
12402 z_data = PyUnicode_DATA(z); | |
12403 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { | |
12404 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); | |
12405 if (!key) | |
12406 goto err; | |
12407 res = PyDict_SetItem(new, key, Py_None); | |
12408 Py_DECREF(key); | |
12409 if (res < 0) | |
12410 goto err; | |
12411 } | |
12412 } | |
12413 } else { | |
12414 int kind; | |
12415 void *data; | |
12416 | |
12417 /* x must be a dict */ | |
12418 if (!PyDict_CheckExact(x)) { | |
12419 PyErr_SetString(PyExc_TypeError, "if you give only one argument " | |
12420 "to maketrans it must be a dict"); | |
12421 goto err; | |
12422 } | |
12423 /* copy entries into the new dict, converting string keys to int keys */ | |
12424 while (PyDict_Next(x, &i, &key, &value)) { | |
12425 if (PyUnicode_Check(key)) { | |
12426 /* convert string keys to integer keys */ | |
12427 PyObject *newkey; | |
12428 if (PyUnicode_GET_LENGTH(key) != 1) { | |
12429 PyErr_SetString(PyExc_ValueError, "string keys in translate " | |
12430 "table must be of length 1"); | |
12431 goto err; | |
12432 } | |
12433 kind = PyUnicode_KIND(key); | |
12434 data = PyUnicode_DATA(key); | |
12435 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); | |
12436 if (!newkey) | |
12437 goto err; | |
12438 res = PyDict_SetItem(new, newkey, value); | |
12439 Py_DECREF(newkey); | |
12440 if (res < 0) | |
12441 goto err; | |
12442 } else if (PyLong_Check(key)) { | |
12443 /* just keep integer keys */ | |
12444 if (PyDict_SetItem(new, key, value) < 0) | |
12445 goto err; | |
12446 } else { | |
12447 PyErr_SetString(PyExc_TypeError, "keys in translate table must " | |
12448 "be strings or integers"); | |
12449 goto err; | |
12450 } | |
12451 } | |
12452 } | |
12453 return new; | |
12454 err: | |
12455 Py_DECREF(new); | |
12456 return NULL; | |
12457 } | |
12458 | |
12459 PyDoc_STRVAR(translate__doc__, | |
12460 "S.translate(table) -> str\n\ | |
12461 \n\ | |
12462 Return a copy of the string S, where all characters have been mapped\n\ | |
12463 through the given translation table, which must be a mapping of\n\ | |
12464 Unicode ordinals to Unicode ordinals, strings, or None.\n\ | |
12465 Unmapped characters are left untouched. Characters mapped to None\n\ | |
12466 are deleted."); | |
12467 | |
12468 static PyObject* | |
12469 unicode_translate(PyObject *self, PyObject *table) | |
12470 { | |
12471 return _PyUnicode_TranslateCharmap(self, table, "ignore"); | |
12472 } | |
12473 | |
12474 PyDoc_STRVAR(upper__doc__, | |
12475 "S.upper() -> str\n\ | |
12476 \n\ | |
12477 Return a copy of S converted to uppercase."); | |
12478 | |
12479 static PyObject* | |
12480 unicode_upper(PyObject *self) | |
12481 { | |
12482 return fixup(self, fixupper); | |
12483 } | |
12484 | |
12485 PyDoc_STRVAR(zfill__doc__, | |
12486 "S.zfill(width) -> str\n\ | |
12487 \n\ | |
12488 Pad a numeric string S with zeros on the left, to fill a field\n\ | |
12489 of the specified width. The string S is never truncated."); | |
12490 | |
12491 static PyObject * | |
12492 unicode_zfill(PyObject *self, PyObject *args) | |
12493 { | |
12494 Py_ssize_t fill; | |
12495 PyObject *u; | |
12496 Py_ssize_t width; | |
12497 int kind; | |
12498 void *data; | |
12499 Py_UCS4 chr; | |
12500 | |
12501 if (PyUnicode_READY(self) == -1) | |
12502 return NULL; | |
12503 | |
12504 if (!PyArg_ParseTuple(args, "n:zfill", &width)) | |
12505 return NULL; | |
12506 | |
12507 if (PyUnicode_GET_LENGTH(self) >= width) { | |
12508 if (PyUnicode_CheckExact(self)) { | |
12509 Py_INCREF(self); | |
12510 return self; | |
12511 } | |
12512 else | |
12513 return PyUnicode_Copy(self); | |
12514 } | |
12515 | |
12516 fill = width - _PyUnicode_LENGTH(self); | |
12517 | |
12518 u = pad(self, fill, 0, '0'); | |
12519 | |
12520 if (u == NULL) | |
12521 return NULL; | |
12522 | |
12523 kind = PyUnicode_KIND(u); | |
12524 data = PyUnicode_DATA(u); | |
12525 chr = PyUnicode_READ(kind, data, fill); | |
12526 | |
12527 if (chr == '+' || chr == '-') { | |
12528 /* move sign to beginning of string */ | |
12529 PyUnicode_WRITE(kind, data, 0, chr); | |
12530 PyUnicode_WRITE(kind, data, fill, '0'); | |
12531 } | |
12532 | |
12533 assert(_PyUnicode_CheckConsistency(u, 1)); | |
12534 return u; | |
12535 } | |
12536 | |
#if 0
/* Disabled debugging helper: forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12544 | |
12545 PyDoc_STRVAR(startswith__doc__, | |
12546 "S.startswith(prefix[, start[, end]]) -> bool\n\ | |
12547 \n\ | |
12548 Return True if S starts with the specified prefix, False otherwise.\n\ | |
12549 With optional start, test S beginning at that position.\n\ | |
12550 With optional end, stop comparing S at that position.\n\ | |
12551 prefix can also be a tuple of strings to try."); | |
12552 | |
12553 static PyObject * | |
12554 unicode_startswith(PyObject *self, | |
12555 PyObject *args) | |
12556 { | |
12557 PyObject *subobj; | |
12558 PyObject *substring; | |
12559 Py_ssize_t start = 0; | |
12560 Py_ssize_t end = PY_SSIZE_T_MAX; | |
12561 int result; | |
12562 | |
12563 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) | |
12564 return NULL; | |
12565 if (PyTuple_Check(subobj)) { | |
12566 Py_ssize_t i; | |
12567 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { | |
12568 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); | |
12569 if (substring == NULL) | |
12570 return NULL; | |
12571 result = tailmatch(self, substring, start, end, -1); | |
12572 Py_DECREF(substring); | |
12573 if (result) { | |
12574 Py_RETURN_TRUE; | |
12575 } | |
12576 } | |
12577 /* nothing matched */ | |
12578 Py_RETURN_FALSE; | |
12579 } | |
12580 substring = PyUnicode_FromObject(subobj); | |
12581 if (substring == NULL) { | |
12582 if (PyErr_ExceptionMatches(PyExc_TypeError)) | |
12583 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " | |
12584 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); | |
12585 return NULL; | |
12586 } | |
12587 result = tailmatch(self, substring, start, end, -1); | |
12588 Py_DECREF(substring); | |
12589 return PyBool_FromLong(result); | |
12590 } | |
12591 | |
12592 | |
12593 PyDoc_STRVAR(endswith__doc__, | |
12594 "S.endswith(suffix[, start[, end]]) -> bool\n\ | |
12595 \n\ | |
12596 Return True if S ends with the specified suffix, False otherwise.\n\ | |
12597 With optional start, test S beginning at that position.\n\ | |
12598 With optional end, stop comparing S at that position.\n\ | |
12599 suffix can also be a tuple of strings to try."); | |
12600 | |
12601 static PyObject * | |
12602 unicode_endswith(PyObject *self, | |
12603 PyObject *args) | |
12604 { | |
12605 PyObject *subobj; | |
12606 PyObject *substring; | |
12607 Py_ssize_t start = 0; | |
12608 Py_ssize_t end = PY_SSIZE_T_MAX; | |
12609 int result; | |
12610 | |
12611 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) | |
12612 return NULL; | |
12613 if (PyTuple_Check(subobj)) { | |
12614 Py_ssize_t i; | |
12615 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { | |
12616 substring = PyUnicode_FromObject( | |
12617 PyTuple_GET_ITEM(subobj, i)); | |
12618 if (substring == NULL) | |
12619 return NULL; | |
12620 result = tailmatch(self, substring, start, end, +1); | |
12621 Py_DECREF(substring); | |
12622 if (result) { | |
12623 Py_RETURN_TRUE; | |
12624 } | |
12625 } | |
12626 Py_RETURN_FALSE; | |
12627 } | |
12628 substring = PyUnicode_FromObject(subobj); | |
12629 if (substring == NULL) { | |
12630 if (PyErr_ExceptionMatches(PyExc_TypeError)) | |
12631 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " | |
12632 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); | |
12633 return NULL; | |
12634 } | |
12635 result = tailmatch(self, substring, start, end, +1); | |
12636 Py_DECREF(substring); | |
12637 return PyBool_FromLong(result); | |
12638 } | |
12639 | |
12640 #include "stringlib/unicode_format.h" | |
12641 | |
12642 PyDoc_STRVAR(format__doc__, | |
12643 "S.format(*args, **kwargs) -> str\n\ | |
12644 \n\ | |
12645 Return a formatted version of S, using substitutions from args and kwargs.\n\ | |
12646 The substitutions are identified by braces ('{' and '}')."); | |
12647 | |
12648 PyDoc_STRVAR(format_map__doc__, | |
12649 "S.format_map(mapping) -> str\n\ | |
12650 \n\ | |
12651 Return a formatted version of S, using substitutions from mapping.\n\ | |
12652 The substitutions are identified by braces ('{' and '}')."); | |
12653 | |
12654 static PyObject * | |
12655 unicode__format__(PyObject* self, PyObject* args) | |
12656 { | |
12657 PyObject *format_spec, *out; | |
12658 | |
12659 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) | |
12660 return NULL; | |
12661 | |
12662 out = _PyUnicode_FormatAdvanced(self, format_spec, 0, | |
12663 PyUnicode_GET_LENGTH(format_spec)); | |
12664 return out; | |
12665 } | |
12666 | |
12667 PyDoc_STRVAR(p_format__doc__, | |
12668 "S.__format__(format_spec) -> str\n\ | |
12669 \n\ | |
12670 Return a formatted version of S as described by format_spec."); | |
12671 | |
12672 static PyObject * | |
12673 unicode__sizeof__(PyObject *v) | |
12674 { | |
12675 Py_ssize_t size; | |
12676 | |
12677 /* If it's a compact object, account for base structure + | |
12678 character data. */ | |
12679 if (PyUnicode_IS_COMPACT_ASCII(v)) | |
12680 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; | |
12681 else if (PyUnicode_IS_COMPACT(v)) | |
12682 size = sizeof(PyCompactUnicodeObject) + | |
12683 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); | |
12684 else { | |
12685 /* If it is a two-block object, account for base object, and | |
12686 for character block if present. */ | |
12687 size = sizeof(PyUnicodeObject); | |
12688 if (_PyUnicode_DATA_ANY(v)) | |
12689 size += (PyUnicode_GET_LENGTH(v) + 1) * | |
12690 PyUnicode_KIND(v); | |
12691 } | |
12692 /* If the wstr pointer is present, account for it unless it is shared | |
12693 with the data pointer. Check if the data is not shared. */ | |
12694 if (_PyUnicode_HAS_WSTR_MEMORY(v)) | |
12695 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); | |
12696 if (_PyUnicode_HAS_UTF8_MEMORY(v)) | |
12697 size += PyUnicode_UTF8_LENGTH(v) + 1; | |
12698 | |
12699 return PyLong_FromSsize_t(size); | |
12700 } | |
12701 | |
12702 PyDoc_STRVAR(sizeof__doc__, | |
12703 "S.__sizeof__() -> size of S in memory, in bytes"); | |
12704 | |
12705 static PyObject * | |
12706 unicode_getnewargs(PyObject *v) | |
12707 { | |
12708 PyObject *copy = PyUnicode_Copy(v); | |
12709 if (!copy) | |
12710 return NULL; | |
12711 return Py_BuildValue("(N)", copy); | |
12712 } | |
12713 | |
12714 static PyMethodDef unicode_methods[] = { | |
12715 | |
12716 /* Order is according to common usage: often used methods should | |
12717 appear first, since lookup is done sequentially. */ | |
12718 | |
12719 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, | |
12720 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, | |
12721 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, | |
12722 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, | |
12723 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, | |
12724 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, | |
12725 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, | |
12726 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, | |
12727 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, | |
12728 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, | |
12729 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, | |
12730 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, | |
12731 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, | |
12732 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, | |
12733 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, | |
12734 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, | |
12735 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, | |
12736 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, | |
12737 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, | |
12738 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, | |
12739 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, | |
12740 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, | |
12741 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, | |
12742 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, | |
12743 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, | |
12744 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, | |
12745 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, | |
12746 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, | |
12747 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, | |
12748 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, | |
12749 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, | |
12750 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, | |
12751 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, | |
12752 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, | |
12753 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, | |
12754 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, | |
12755 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, | |
12756 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, | |
12757 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, | |
12758 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, | |
12759 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, | |
12760 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, | |
12761 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, | |
12762 {"maketrans", (PyCFunction) unicode_maketrans, | |
12763 METH_VARARGS | METH_STATIC, maketrans__doc__}, | |
12764 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, | |
12765 #if 0 | |
12766 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, | |
12767 #endif | |
12768 | |
12769 #if 0 | |
12770 /* These methods are just used for debugging the implementation. */ | |
12771 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, | |
12772 #endif | |
12773 | |
12774 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, | |
12775 {NULL, NULL} | |
12776 }; | |
12777 | |
12778 static PyObject * | |
12779 unicode_mod(PyObject *v, PyObject *w) | |
12780 { | |
12781 if (!PyUnicode_Check(v)) | |
12782 Py_RETURN_NOTIMPLEMENTED; | |
12783 return PyUnicode_Format(v, w); | |
12784 } | |
12785 | |
12786 static PyNumberMethods unicode_as_number = { | |
12787 0, /*nb_add*/ | |
12788 0, /*nb_subtract*/ | |
12789 0, /*nb_multiply*/ | |
12790 unicode_mod, /*nb_remainder*/ | |
12791 }; | |
12792 | |
12793 static PySequenceMethods unicode_as_sequence = { | |
12794 (lenfunc) unicode_length, /* sq_length */ | |
12795 PyUnicode_Concat, /* sq_concat */ | |
12796 (ssizeargfunc) unicode_repeat, /* sq_repeat */ | |
12797 (ssizeargfunc) unicode_getitem, /* sq_item */ | |
12798 0, /* sq_slice */ | |
12799 0, /* sq_ass_item */ | |
12800 0, /* sq_ass_slice */ | |
12801 PyUnicode_Contains, /* sq_contains */ | |
12802 }; | |
12803 | |
12804 static PyObject* | |
12805 unicode_subscript(PyObject* self, PyObject* item) | |
12806 { | |
12807 if (PyUnicode_READY(self) == -1) | |
12808 return NULL; | |
12809 | |
12810 if (PyIndex_Check(item)) { | |
12811 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); | |
12812 if (i == -1 && PyErr_Occurred()) | |
12813 return NULL; | |
12814 if (i < 0) | |
12815 i += PyUnicode_GET_LENGTH(self); | |
12816 return unicode_getitem(self, i); | |
12817 } else if (PySlice_Check(item)) { | |
12818 Py_ssize_t start, stop, step, slicelength, cur, i; | |
12819 PyObject *result; | |
12820 void *src_data, *dest_data; | |
12821 int src_kind, dest_kind; | |
12822 Py_UCS4 ch, max_char, kind_limit; | |
12823 | |
12824 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), | |
12825 &start, &stop, &step, &slicelength) < 0) { | |
12826 return NULL; | |
12827 } | |
12828 | |
12829 if (slicelength <= 0) { | |
12830 return PyUnicode_New(0, 0); | |
12831 } else if (start == 0 && step == 1 && | |
12832 slicelength == PyUnicode_GET_LENGTH(self) && | |
12833 PyUnicode_CheckExact(self)) { | |
12834 Py_INCREF(self); | |
12835 return self; | |
12836 } else if (step == 1) { | |
12837 return PyUnicode_Substring(self, | |
12838 start, start + slicelength); | |
12839 } | |
12840 /* General case */ | |
12841 src_kind = PyUnicode_KIND(self); | |
12842 src_data = PyUnicode_DATA(self); | |
12843 if (!PyUnicode_IS_ASCII(self)) { | |
12844 kind_limit = kind_maxchar_limit(src_kind); | |
12845 max_char = 0; | |
12846 for (cur = start, i = 0; i < slicelength; cur += step, i++) { | |
12847 ch = PyUnicode_READ(src_kind, src_data, cur); | |
12848 if (ch > max_char) { | |
12849 max_char = ch; | |
12850 if (max_char >= kind_limit) | |
12851 break; | |
12852 } | |
12853 } | |
12854 } | |
12855 else | |
12856 max_char = 127; | |
12857 result = PyUnicode_New(slicelength, max_char); | |
12858 if (result == NULL) | |
12859 return NULL; | |
12860 dest_kind = PyUnicode_KIND(result); | |
12861 dest_data = PyUnicode_DATA(result); | |
12862 | |
12863 for (cur = start, i = 0; i < slicelength; cur += step, i++) { | |
12864 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); | |
12865 PyUnicode_WRITE(dest_kind, dest_data, i, ch); | |
12866 } | |
12867 assert(_PyUnicode_CheckConsistency(result, 1)); | |
12868 return result; | |
12869 } else { | |
12870 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); | |
12871 return NULL; | |
12872 } | |
12873 } | |
12874 | |
12875 static PyMappingMethods unicode_as_mapping = { | |
12876 (lenfunc)unicode_length, /* mp_length */ | |
12877 (binaryfunc)unicode_subscript, /* mp_subscript */ | |
12878 (objobjargproc)0, /* mp_ass_subscript */ | |
12879 }; | |
12880 | |
12881 | |
12882 /* Helpers for PyUnicode_Format() */ | |
12883 | |
12884 static PyObject * | |
12885 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) | |
12886 { | |
12887 Py_ssize_t argidx = *p_argidx; | |
12888 if (argidx < arglen) { | |
12889 (*p_argidx)++; | |
12890 if (arglen < 0) | |
12891 return args; | |
12892 else | |
12893 return PyTuple_GetItem(args, argidx); | |
12894 } | |
12895 PyErr_SetString(PyExc_TypeError, | |
12896 "not enough arguments for format string"); | |
12897 return NULL; | |
12898 } | |
12899 | |
12900 /* Returns a new reference to a PyUnicode object, or NULL on failure. */ | |
12901 | |
12902 static PyObject * | |
12903 formatfloat(PyObject *v, int flags, int prec, int type) | |
12904 { | |
12905 char *p; | |
12906 PyObject *result; | |
12907 double x; | |
12908 | |
12909 x = PyFloat_AsDouble(v); | |
12910 if (x == -1.0 && PyErr_Occurred()) | |
12911 return NULL; | |
12912 | |
12913 if (prec < 0) | |
12914 prec = 6; | |
12915 | |
12916 p = PyOS_double_to_string(x, type, prec, | |
12917 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); | |
12918 if (p == NULL) | |
12919 return NULL; | |
12920 result = PyUnicode_DecodeASCII(p, strlen(p), NULL); | |
12921 PyMem_Free(p); | |
12922 return result; | |
12923 } | |
12924 | |
12925 static PyObject* | |
12926 formatlong(PyObject *val, int flags, int prec, int type) | |
12927 { | |
12928 char *buf; | |
12929 int len; | |
12930 PyObject *str; /* temporary string object. */ | |
12931 PyObject *result; | |
12932 | |
12933 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); | |
12934 if (!str) | |
12935 return NULL; | |
12936 result = PyUnicode_DecodeASCII(buf, len, NULL); | |
12937 Py_DECREF(str); | |
12938 return result; | |
12939 } | |
12940 | |
12941 static Py_UCS4 | |
12942 formatchar(PyObject *v) | |
12943 { | |
12944 /* presume that the buffer is at least 3 characters long */ | |
12945 if (PyUnicode_Check(v)) { | |
12946 if (PyUnicode_GET_LENGTH(v) == 1) { | |
12947 return PyUnicode_READ_CHAR(v, 0); | |
12948 } | |
12949 goto onError; | |
12950 } | |
12951 else { | |
12952 /* Integer input truncated to a character */ | |
12953 long x; | |
12954 x = PyLong_AsLong(v); | |
12955 if (x == -1 && PyErr_Occurred()) | |
12956 goto onError; | |
12957 | |
12958 if (x < 0 || x > MAX_UNICODE) { | |
12959 PyErr_SetString(PyExc_OverflowError, | |
12960 "%c arg not in range(0x110000)"); | |
12961 return (Py_UCS4) -1; | |
12962 } | |
12963 | |
12964 return (Py_UCS4) x; | |
12965 } | |
12966 | |
12967 onError: | |
12968 PyErr_SetString(PyExc_TypeError, | |
12969 "%c requires int or char"); | |
12970 return (Py_UCS4) -1; | |
12971 } | |
12972 | |
12973 static int | |
12974 repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count) | |
12975 { | |
12976 int r; | |
12977 assert(count > 0); | |
12978 assert(PyUnicode_Check(obj)); | |
12979 if (count > 5) { | |
12980 PyObject *repeated = unicode_repeat(obj, count); | |
12981 if (repeated == NULL) | |
12982 return -1; | |
12983 r = _PyAccu_Accumulate(acc, repeated); | |
12984 Py_DECREF(repeated); | |
12985 return r; | |
12986 } | |
12987 else { | |
12988 do { | |
12989 if (_PyAccu_Accumulate(acc, obj)) | |
12990 return -1; | |
12991 } while (--count); | |
12992 return 0; | |
12993 } | |
12994 } | |
12995 | |
12996 PyObject * | |
12997 PyUnicode_Format(PyObject *format, PyObject *args) | |
12998 { | |
12999 void *fmt; | |
13000 int fmtkind; | |
13001 PyObject *result; | |
13002 int kind; | |
13003 int r; | |
13004 Py_ssize_t fmtcnt, fmtpos, arglen, argidx; | |
13005 int args_owned = 0; | |
13006 PyObject *dict = NULL; | |
13007 PyObject *temp = NULL; | |
13008 PyObject *second = NULL; | |
13009 PyObject *uformat; | |
13010 _PyAccu acc; | |
13011 static PyObject *plus, *minus, *blank, *zero, *percent; | |
13012 | |
13013 if (!plus && !(plus = get_latin1_char('+'))) | |
13014 return NULL; | |
13015 if (!minus && !(minus = get_latin1_char('-'))) | |
13016 return NULL; | |
13017 if (!blank && !(blank = get_latin1_char(' '))) | |
13018 return NULL; | |
13019 if (!zero && !(zero = get_latin1_char('0'))) | |
13020 return NULL; | |
13021 if (!percent && !(percent = get_latin1_char('%'))) | |
13022 return NULL; | |
13023 | |
13024 if (format == NULL || args == NULL) { | |
13025 PyErr_BadInternalCall(); | |
13026 return NULL; | |
13027 } | |
13028 uformat = PyUnicode_FromObject(format); | |
13029 if (uformat == NULL || PyUnicode_READY(uformat) == -1) | |
13030 return NULL; | |
13031 if (_PyAccu_Init(&acc)) | |
13032 goto onError; | |
13033 fmt = PyUnicode_DATA(uformat); | |
13034 fmtkind = PyUnicode_KIND(uformat); | |
13035 fmtcnt = PyUnicode_GET_LENGTH(uformat); | |
13036 fmtpos = 0; | |
13037 | |
13038 if (PyTuple_Check(args)) { | |
13039 arglen = PyTuple_Size(args); | |
13040 argidx = 0; | |
13041 } | |
13042 else { | |
13043 arglen = -1; | |
13044 argidx = -2; | |
13045 } | |
13046 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && | |
13047 !PyUnicode_Check(args)) | |
13048 dict = args; | |
13049 | |
13050 while (--fmtcnt >= 0) { | |
13051 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { | |
13052 PyObject *nonfmt; | |
13053 Py_ssize_t nonfmtpos; | |
13054 nonfmtpos = fmtpos++; | |
13055 while (fmtcnt >= 0 && | |
13056 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { | |
13057 fmtpos++; | |
13058 fmtcnt--; | |
13059 } | |
13060 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos); | |
13061 if (nonfmt == NULL) | |
13062 goto onError; | |
13063 r = _PyAccu_Accumulate(&acc, nonfmt); | |
13064 Py_DECREF(nonfmt); | |
13065 if (r) | |
13066 goto onError; | |
13067 } | |
13068 else { | |
13069 /* Got a format specifier */ | |
13070 int flags = 0; | |
13071 Py_ssize_t width = -1; | |
13072 int prec = -1; | |
13073 Py_UCS4 c = '\0'; | |
13074 Py_UCS4 fill, sign; | |
13075 int isnumok; | |
13076 PyObject *v = NULL; | |
13077 void *pbuf = NULL; | |
13078 Py_ssize_t pindex, len; | |
13079 PyObject *signobj = NULL, *fillobj = NULL; | |
13080 | |
13081 fmtpos++; | |
13082 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { | |
13083 Py_ssize_t keystart; | |
13084 Py_ssize_t keylen; | |
13085 PyObject *key; | |
13086 int pcount = 1; | |
13087 | |
13088 if (dict == NULL) { | |
13089 PyErr_SetString(PyExc_TypeError, | |
13090 "format requires a mapping"); | |
13091 goto onError; | |
13092 } | |
13093 ++fmtpos; | |
13094 --fmtcnt; | |
13095 keystart = fmtpos; | |
13096 /* Skip over balanced parentheses */ | |
13097 while (pcount > 0 && --fmtcnt >= 0) { | |
13098 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')') | |
13099 --pcount; | |
13100 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') | |
13101 ++pcount; | |
13102 fmtpos++; | |
13103 } | |
13104 keylen = fmtpos - keystart - 1; | |
13105 if (fmtcnt < 0 || pcount > 0) { | |
13106 PyErr_SetString(PyExc_ValueError, | |
13107 "incomplete format key"); | |
13108 goto onError; | |
13109 } | |
13110 key = PyUnicode_Substring(uformat, | |
13111 keystart, keystart + keylen); | |
13112 if (key == NULL) | |
13113 goto onError; | |
13114 if (args_owned) { | |
13115 Py_DECREF(args); | |
13116 args_owned = 0; | |
13117 } | |
13118 args = PyObject_GetItem(dict, key); | |
13119 Py_DECREF(key); | |
13120 if (args == NULL) { | |
13121 goto onError; | |
13122 } | |
13123 args_owned = 1; | |
13124 arglen = -1; | |
13125 argidx = -2; | |
13126 } | |
13127 while (--fmtcnt >= 0) { | |
13128 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) { | |
13129 case '-': flags |= F_LJUST; continue; | |
13130 case '+': flags |= F_SIGN; continue; | |
13131 case ' ': flags |= F_BLANK; continue; | |
13132 case '#': flags |= F_ALT; continue; | |
13133 case '0': flags |= F_ZERO; continue; | |
13134 } | |
13135 break; | |
13136 } | |
13137 if (c == '*') { | |
13138 v = getnextarg(args, arglen, &argidx); | |
13139 if (v == NULL) | |
13140 goto onError; | |
13141 if (!PyLong_Check(v)) { | |
13142 PyErr_SetString(PyExc_TypeError, | |
13143 "* wants int"); | |
13144 goto onError; | |
13145 } | |
13146 width = PyLong_AsLong(v); | |
13147 if (width == -1 && PyErr_Occurred()) | |
13148 goto onError; | |
13149 if (width < 0) { | |
13150 flags |= F_LJUST; | |
13151 width = -width; | |
13152 } | |
13153 if (--fmtcnt >= 0) | |
13154 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); | |
13155 } | |
13156 else if (c >= '0' && c <= '9') { | |
13157 width = c - '0'; | |
13158 while (--fmtcnt >= 0) { | |
13159 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); | |
13160 if (c < '0' || c > '9') | |
13161 break; | |
13162 if ((width*10) / 10 != width) { | |
13163 PyErr_SetString(PyExc_ValueError, | |
13164 "width too big"); | |
13165 goto onError; | |
13166 } | |
13167 width = width*10 + (c - '0'); | |
13168 } | |
13169 } | |
13170 if (c == '.') { | |
13171 prec = 0; | |
13172 if (--fmtcnt >= 0) | |
13173 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); | |
13174 if (c == '*') { | |
13175 v = getnextarg(args, arglen, &argidx); | |
13176 if (v == NULL) | |
13177 goto onError; | |
13178 if (!PyLong_Check(v)) { | |
13179 PyErr_SetString(PyExc_TypeError, | |
13180 "* wants int"); | |
13181 goto onError; | |
13182 } | |
13183 prec = PyLong_AsLong(v); | |
13184 if (prec == -1 && PyErr_Occurred()) | |
13185 goto onError; | |
13186 if (prec < 0) | |
13187 prec = 0; | |
13188 if (--fmtcnt >= 0) | |
13189 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); | |
13190 } | |
13191 else if (c >= '0' && c <= '9') { | |
13192 prec = c - '0'; | |
13193 while (--fmtcnt >= 0) { | |
13194 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); | |
13195 if (c < '0' || c > '9') | |
13196 break; | |
13197 if ((prec*10) / 10 != prec) { | |
13198 PyErr_SetString(PyExc_ValueError, | |
13199 "prec too big"); | |
13200 goto onError; | |
13201 } | |
13202 prec = prec*10 + (c - '0'); | |
13203 } | |
13204 } | |
13205 } /* prec */ | |
13206 if (fmtcnt >= 0) { | |
13207 if (c == 'h' || c == 'l' || c == 'L') { | |
13208 if (--fmtcnt >= 0) | |
13209 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); | |
13210 } | |
13211 } | |
13212 if (fmtcnt < 0) { | |
13213 PyErr_SetString(PyExc_ValueError, | |
13214 "incomplete format"); | |
13215 goto onError; | |
13216 } | |
13217 if (c != '%') { | |
13218 v = getnextarg(args, arglen, &argidx); | |
13219 if (v == NULL) | |
13220 goto onError; | |
13221 } | |
13222 sign = 0; | |
13223 fill = ' '; | |
13224 fillobj = blank; | |
13225 switch (c) { | |
13226 | |
13227 case '%': | |
13228 _PyAccu_Accumulate(&acc, percent); | |
13229 continue; | |
13230 | |
13231 case 's': | |
13232 case 'r': | |
13233 case 'a': | |
13234 if (PyUnicode_CheckExact(v) && c == 's') { | |
13235 temp = v; | |
13236 Py_INCREF(temp); | |
13237 } | |
13238 else { | |
13239 if (c == 's') | |
13240 temp = PyObject_Str(v); | |
13241 else if (c == 'r') | |
13242 temp = PyObject_Repr(v); | |
13243 else | |
13244 temp = PyObject_ASCII(v); | |
13245 if (temp == NULL) | |
13246 goto onError; | |
13247 if (PyUnicode_Check(temp)) | |
13248 /* nothing to do */; | |
13249 else { | |
13250 Py_DECREF(temp); | |
13251 PyErr_SetString(PyExc_TypeError, | |
13252 "%s argument has non-string str()"); | |
13253 goto onError; | |
13254 } | |
13255 } | |
13256 if (PyUnicode_READY(temp) == -1) { | |
13257 Py_CLEAR(temp); | |
13258 goto onError; | |
13259 } | |
13260 pbuf = PyUnicode_DATA(temp); | |
13261 kind = PyUnicode_KIND(temp); | |
13262 len = PyUnicode_GET_LENGTH(temp); | |
13263 if (prec >= 0 && len > prec) | |
13264 len = prec; | |
13265 break; | |
13266 | |
13267 case 'i': | |
13268 case 'd': | |
13269 case 'u': | |
13270 case 'o': | |
13271 case 'x': | |
13272 case 'X': | |
13273 isnumok = 0; | |
13274 if (PyNumber_Check(v)) { | |
13275 PyObject *iobj=NULL; | |
13276 | |
13277 if (PyLong_Check(v)) { | |
13278 iobj = v; | |
13279 Py_INCREF(iobj); | |
13280 } | |
13281 else { | |
13282 iobj = PyNumber_Long(v); | |
13283 } | |
13284 if (iobj!=NULL) { | |
13285 if (PyLong_Check(iobj)) { | |
13286 isnumok = 1; | |
13287 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c)); | |
13288 Py_DECREF(iobj); | |
13289 if (!temp) | |
13290 goto onError; | |
13291 if (PyUnicode_READY(temp) == -1) { | |
13292 Py_CLEAR(temp); | |
13293 goto onError; | |
13294 } | |
13295 pbuf = PyUnicode_DATA(temp); | |
13296 kind = PyUnicode_KIND(temp); | |
13297 len = PyUnicode_GET_LENGTH(temp); | |
13298 sign = 1; | |
13299 } | |
13300 else { | |
13301 Py_DECREF(iobj); | |
13302 } | |
13303 } | |
13304 } | |
13305 if (!isnumok) { | |
13306 PyErr_Format(PyExc_TypeError, | |
13307 "%%%c format: a number is required, " | |
13308 "not %.200s", (char)c, Py_TYPE(v)->tp_name); | |
13309 goto onError; | |
13310 } | |
13311 if (flags & F_ZERO) { | |
13312 fill = '0'; | |
13313 fillobj = zero; | |
13314 } | |
13315 break; | |
13316 | |
13317 case 'e': | |
13318 case 'E': | |
13319 case 'f': | |
13320 case 'F': | |
13321 case 'g': | |
13322 case 'G': | |
13323 temp = formatfloat(v, flags, prec, c); | |
13324 if (!temp) | |
13325 goto onError; | |
13326 if (PyUnicode_READY(temp) == -1) { | |
13327 Py_CLEAR(temp); | |
13328 goto onError; | |
13329 } | |
13330 pbuf = PyUnicode_DATA(temp); | |
13331 kind = PyUnicode_KIND(temp); | |
13332 len = PyUnicode_GET_LENGTH(temp); | |
13333 sign = 1; | |
13334 if (flags & F_ZERO) { | |
13335 fill = '0'; | |
13336 fillobj = zero; | |
13337 } | |
13338 break; | |
13339 | |
13340 case 'c': | |
13341 { | |
13342 Py_UCS4 ch = formatchar(v); | |
13343 if (ch == (Py_UCS4) -1) | |
13344 goto onError; | |
13345 temp = _PyUnicode_FromUCS4(&ch, 1); | |
13346 if (temp == NULL) | |
13347 goto onError; | |
13348 pbuf = PyUnicode_DATA(temp); | |
13349 kind = PyUnicode_KIND(temp); | |
13350 len = PyUnicode_GET_LENGTH(temp); | |
13351 break; | |
13352 } | |
13353 | |
13354 default: | |
13355 PyErr_Format(PyExc_ValueError, | |
13356 "unsupported format character '%c' (0x%x) " | |
13357 "at index %zd", | |
13358 (31<=c && c<=126) ? (char)c : '?', | |
13359 (int)c, | |
13360 fmtpos - 1); | |
13361 goto onError; | |
13362 } | |
13363 /* pbuf is initialized here. */ | |
13364 pindex = 0; | |
13365 if (sign) { | |
13366 if (PyUnicode_READ(kind, pbuf, pindex) == '-') { | |
13367 signobj = minus; | |
13368 len--; | |
13369 pindex++; | |
13370 } | |
13371 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') { | |
13372 signobj = plus; | |
13373 len--; | |
13374 pindex++; | |
13375 } | |
13376 else if (flags & F_SIGN) | |
13377 signobj = plus; | |
13378 else if (flags & F_BLANK) | |
13379 signobj = blank; | |
13380 else | |
13381 sign = 0; | |
13382 } | |
13383 if (width < len) | |
13384 width = len; | |
13385 if (sign) { | |
13386 if (fill != ' ') { | |
13387 assert(signobj != NULL); | |
13388 if (_PyAccu_Accumulate(&acc, signobj)) | |
13389 goto onError; | |
13390 } | |
13391 if (width > len) | |
13392 width--; | |
13393 } | |
13394 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { | |
13395 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); | |
13396 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c); | |
13397 if (fill != ' ') { | |
13398 second = get_latin1_char( | |
13399 PyUnicode_READ(kind, pbuf, pindex + 1)); | |
13400 pindex += 2; | |
13401 if (second == NULL || | |
13402 _PyAccu_Accumulate(&acc, zero) || | |
13403 _PyAccu_Accumulate(&acc, second)) | |
13404 goto onError; | |
13405 Py_CLEAR(second); | |
13406 } | |
13407 width -= 2; | |
13408 if (width < 0) | |
13409 width = 0; | |
13410 len -= 2; | |
13411 } | |
13412 if (width > len && !(flags & F_LJUST)) { | |
13413 assert(fillobj != NULL); | |
13414 if (repeat_accumulate(&acc, fillobj, width - len)) | |
13415 goto onError; | |
13416 width = len; | |
13417 } | |
13418 if (fill == ' ') { | |
13419 if (sign) { | |
13420 assert(signobj != NULL); | |
13421 if (_PyAccu_Accumulate(&acc, signobj)) | |
13422 goto onError; | |
13423 } | |
13424 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { | |
13425 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); | |
13426 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); | |
13427 second = get_latin1_char( | |
13428 PyUnicode_READ(kind, pbuf, pindex + 1)); | |
13429 pindex += 2; | |
13430 if (second == NULL || | |
13431 _PyAccu_Accumulate(&acc, zero) || | |
13432 _PyAccu_Accumulate(&acc, second)) | |
13433 goto onError; | |
13434 Py_CLEAR(second); | |
13435 } | |
13436 } | |
13437 /* Copy all characters, preserving len */ | |
13438 if (temp != NULL) { | |
13439 assert(pbuf == PyUnicode_DATA(temp)); | |
13440 v = PyUnicode_Substring(temp, pindex, pindex + len); | |
13441 } | |
13442 else { | |
13443 const char *p = (const char *) pbuf; | |
13444 assert(pbuf != NULL); | |
13445 p += kind * pindex; | |
13446 v = PyUnicode_FromKindAndData(kind, p, len); | |
13447 } | |
13448 if (v == NULL) | |
13449 goto onError; | |
13450 r = _PyAccu_Accumulate(&acc, v); | |
13451 Py_DECREF(v); | |
13452 if (r) | |
13453 goto onError; | |
13454 if (width > len && repeat_accumulate(&acc, blank, width - len)) | |
13455 goto onError; | |
13456 if (dict && (argidx < arglen) && c != '%') { | |
13457 PyErr_SetString(PyExc_TypeError, | |
13458 "not all arguments converted during string formatting"); | |
13459 goto onError; | |
13460 } | |
13461 Py_CLEAR(temp); | |
13462 } /* '%' */ | |
13463 } /* until end */ | |
13464 if (argidx < arglen && !dict) { | |
13465 PyErr_SetString(PyExc_TypeError, | |
13466 "not all arguments converted during string formatting"); | |
13467 goto onError; | |
13468 } | |
13469 | |
13470 result = _PyAccu_Finish(&acc); | |
13471 if (args_owned) { | |
13472 Py_DECREF(args); | |
13473 } | |
13474 Py_DECREF(uformat); | |
13475 Py_XDECREF(temp); | |
13476 Py_XDECREF(second); | |
13477 return result; | |
13478 | |
13479 onError: | |
13480 Py_DECREF(uformat); | |
13481 Py_XDECREF(temp); | |
13482 Py_XDECREF(second); | |
13483 _PyAccu_Destroy(&acc); | |
13484 if (args_owned) { | |
13485 Py_DECREF(args); | |
13486 } | |
13487 return NULL; | |
13488 } | |
13489 | |
13490 static PyObject * | |
13491 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); | |
13492 | |
13493 static PyObject * | |
13494 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) | |
13495 { | |
13496 PyObject *x = NULL; | |
13497 static char *kwlist[] = {"object", "encoding", "errors", 0}; | |
13498 char *encoding = NULL; | |
13499 char *errors = NULL; | |
13500 | |
13501 if (type != &PyUnicode_Type) | |
13502 return unicode_subtype_new(type, args, kwds); | |
13503 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", | |
13504 kwlist, &x, &encoding, &errors)) | |
13505 return NULL; | |
13506 if (x == NULL) | |
13507 return PyUnicode_New(0, 0); | |
13508 if (encoding == NULL && errors == NULL) | |
13509 return PyObject_Str(x); | |
13510 else | |
13511 return PyUnicode_FromEncodedObject(x, encoding, errors); | |
13512 } | |
13513 | |
13514 static PyObject * | |
13515 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) | |
13516 { | |
13517 PyObject *unicode, *self; | |
13518 Py_ssize_t length, char_size; | |
13519 int share_wstr, share_utf8; | |
13520 unsigned int kind; | |
13521 void *data; | |
13522 | |
13523 assert(PyType_IsSubtype(type, &PyUnicode_Type)); | |
13524 | |
13525 unicode = unicode_new(&PyUnicode_Type, args, kwds); | |
13526 if (unicode == NULL) | |
13527 return NULL; | |
13528 assert(_PyUnicode_CHECK(unicode)); | |
13529 if (PyUnicode_READY(unicode)) | |
13530 return NULL; | |
13531 | |
13532 self = type->tp_alloc(type, 0); | |
13533 if (self == NULL) { | |
13534 Py_DECREF(unicode); | |
13535 return NULL; | |
13536 } | |
13537 kind = PyUnicode_KIND(unicode); | |
13538 length = PyUnicode_GET_LENGTH(unicode); | |
13539 | |
13540 _PyUnicode_LENGTH(self) = length; | |
13541 #ifdef Py_DEBUG | |
13542 _PyUnicode_HASH(self) = -1; | |
13543 #else | |
13544 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); | |
13545 #endif | |
13546 _PyUnicode_STATE(self).interned = 0; | |
13547 _PyUnicode_STATE(self).kind = kind; | |
13548 _PyUnicode_STATE(self).compact = 0; | |
13549 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; | |
13550 _PyUnicode_STATE(self).ready = 1; | |
13551 _PyUnicode_WSTR(self) = NULL; | |
13552 _PyUnicode_UTF8_LENGTH(self) = 0; | |
13553 _PyUnicode_UTF8(self) = NULL; | |
13554 _PyUnicode_WSTR_LENGTH(self) = 0; | |
13555 _PyUnicode_DATA_ANY(self) = NULL; | |
13556 | |
13557 share_utf8 = 0; | |
13558 share_wstr = 0; | |
13559 if (kind == PyUnicode_1BYTE_KIND) { | |
13560 char_size = 1; | |
13561 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) | |
13562 share_utf8 = 1; | |
13563 } | |
13564 else if (kind == PyUnicode_2BYTE_KIND) { | |
13565 char_size = 2; | |
13566 if (sizeof(wchar_t) == 2) | |
13567 share_wstr = 1; | |
13568 } | |
13569 else { | |
13570 assert(kind == PyUnicode_4BYTE_KIND); | |
13571 char_size = 4; | |
13572 if (sizeof(wchar_t) == 4) | |
13573 share_wstr = 1; | |
13574 } | |
13575 | |
13576 /* Ensure we won't overflow the length. */ | |
13577 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { | |
13578 PyErr_NoMemory(); | |
13579 goto onError; | |
13580 } | |
13581 data = PyObject_MALLOC((length + 1) * char_size); | |
13582 if (data == NULL) { | |
13583 PyErr_NoMemory(); | |
13584 goto onError; | |
13585 } | |
13586 | |
13587 _PyUnicode_DATA_ANY(self) = data; | |
13588 if (share_utf8) { | |
13589 _PyUnicode_UTF8_LENGTH(self) = length; | |
13590 _PyUnicode_UTF8(self) = data; | |
13591 } | |
13592 if (share_wstr) { | |
13593 _PyUnicode_WSTR_LENGTH(self) = length; | |
13594 _PyUnicode_WSTR(self) = (wchar_t *)data; | |
13595 } | |
13596 | |
13597 Py_MEMCPY(data, PyUnicode_DATA(unicode), | |
13598 kind * (length + 1)); | |
13599 assert(_PyUnicode_CheckConsistency(self, 1)); | |
13600 #ifdef Py_DEBUG | |
13601 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); | |
13602 #endif | |
13603 Py_DECREF(unicode); | |
13604 return self; | |
13605 | |
13606 onError: | |
13607 Py_DECREF(unicode); | |
13608 Py_DECREF(self); | |
13609 return NULL; | |
13610 } | |
13611 | |
/* Docstring of the str type; used as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
13618 | |
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  The slots are positional, so the order of the
   entries below must not change. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                      /* tp_name */
    sizeof(PyUnicodeObject),    /* tp_basicsize */
    0,                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_reserved */
    unicode_repr,               /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,    /* tp_hash*/
    0,                          /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,    /* tp_getattro */
    0,                          /* tp_setattro */
    0,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    unicode_iter,               /* tp_iter */
    0,                          /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                          /* tp_members */
    0,                          /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                          /* tp_dict */
    0,                          /* tp_descr_get */
    0,                          /* tp_descr_set */
    0,                          /* tp_dictoffset */
    0,                          /* tp_init */
    0,                          /* tp_alloc */
    unicode_new,                /* tp_new */
    PyObject_Del,               /* tp_free */
};
13664 | |
13665 /* Initialize the Unicode implementation */ | |
13666 | |
13667 int _PyUnicode_Init(void) | |
13668 { | |
13669 int i; | |
13670 | |
13671 /* XXX - move this array to unicodectype.c ? */ | |
13672 Py_UCS2 linebreak[] = { | |
13673 0x000A, /* LINE FEED */ | |
13674 0x000D, /* CARRIAGE RETURN */ | |
13675 0x001C, /* FILE SEPARATOR */ | |
13676 0x001D, /* GROUP SEPARATOR */ | |
13677 0x001E, /* RECORD SEPARATOR */ | |
13678 0x0085, /* NEXT LINE */ | |
13679 0x2028, /* LINE SEPARATOR */ | |
13680 0x2029, /* PARAGRAPH SEPARATOR */ | |
13681 }; | |
13682 | |
13683 /* Init the implementation */ | |
13684 unicode_empty = PyUnicode_New(0, 0); | |
13685 if (!unicode_empty) | |
13686 Py_FatalError("Can't create empty string"); | |
13687 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); | |
13688 | |
13689 for (i = 0; i < 256; i++) | |
13690 unicode_latin1[i] = NULL; | |
13691 if (PyType_Ready(&PyUnicode_Type) < 0) | |
13692 Py_FatalError("Can't initialize 'unicode'"); | |
13693 | |
13694 /* initialize the linebreak bloom filter */ | |
13695 bloom_linebreak = make_bloom_mask( | |
13696 PyUnicode_2BYTE_KIND, linebreak, | |
13697 Py_ARRAY_LENGTH(linebreak)); | |
13698 | |
13699 PyType_Ready(&EncodingMapType); | |
13700 | |
13701 #ifdef HAVE_MBCS | |
13702 winver.dwOSVersionInfoSize = sizeof(winver); | |
13703 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { | |
13704 PyErr_SetFromWindowsErr(0); | |
13705 return -1; | |
13706 } | |
13707 #endif | |
13708 return 0; | |
13709 } | |
13710 | |
13711 /* Finalize the Unicode implementation */ | |
13712 | |
/* Free-list clearing entry point.  This implementation releases nothing
   and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13718 | |
13719 void | |
13720 _PyUnicode_Fini(void) | |
13721 { | |
13722 int i; | |
13723 | |
13724 Py_XDECREF(unicode_empty); | |
13725 unicode_empty = NULL; | |
13726 | |
13727 for (i = 0; i < 256; i++) { | |
13728 if (unicode_latin1[i]) { | |
13729 Py_DECREF(unicode_latin1[i]); | |
13730 unicode_latin1[i] = NULL; | |
13731 } | |
13732 } | |
13733 _PyUnicode_ClearStaticStrings(); | |
13734 (void)PyUnicode_ClearFreeList(); | |
13735 } | |
13736 | |
/* Intern the string *p in place.

   If an equal string is already in the interned dict, *p is replaced by
   a new reference to that string and the reference to the original is
   released.  Otherwise *p itself is added to the dict and marked
   SSTATE_INTERNED_MORTAL.  Nothing is done for NULL or non-str objects
   (asserted against in debug builds), for instances of str subclasses,
   and for strings that are already interned.  Errors are cleared, not
   reported. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one out. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The recursion_critical flag is raised around the insertion and
       lowered again on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, s, s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
13788 | |
13789 void | |
13790 PyUnicode_InternImmortal(PyObject **p) | |
13791 { | |
13792 PyUnicode_InternInPlace(p); | |
13793 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { | |
13794 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; | |
13795 Py_INCREF(*p); | |
13796 } | |
13797 } | |
13798 | |
13799 PyObject * | |
13800 PyUnicode_InternFromString(const char *cp) | |
13801 { | |
13802 PyObject *s = PyUnicode_FromString(cp); | |
13803 if (s == NULL) | |
13804 return NULL; | |
13805 PyUnicode_InternInPlace(&s); | |
13806 return s; | |
13807 } | |
13808 | |
13809 void | |
13810 _Py_ReleaseInternedUnicodeStrings(void) | |
13811 { | |
13812 PyObject *keys; | |
13813 PyObject *s; | |
13814 Py_ssize_t i, n; | |
13815 Py_ssize_t immortal_size = 0, mortal_size = 0; | |
13816 | |
13817 if (interned == NULL || !PyDict_Check(interned)) | |
13818 return; | |
13819 keys = PyDict_Keys(interned); | |
13820 if (keys == NULL || !PyList_Check(keys)) { | |
13821 PyErr_Clear(); | |
13822 return; | |
13823 } | |
13824 | |
13825 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak | |
13826 detector, interned unicode strings are not forcibly deallocated; | |
13827 rather, we give them their stolen references back, and then clear | |
13828 and DECREF the interned dict. */ | |
13829 | |
13830 n = PyList_GET_SIZE(keys); | |
13831 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", | |
13832 n); | |
13833 for (i = 0; i < n; i++) { | |
13834 s = PyList_GET_ITEM(keys, i); | |
13835 if (PyUnicode_READY(s) == -1) { | |
13836 assert(0 && "could not ready string"); | |
13837 fprintf(stderr, "could not ready string\n"); | |
13838 } | |
13839 switch (PyUnicode_CHECK_INTERNED(s)) { | |
13840 case SSTATE_NOT_INTERNED: | |
13841 /* XXX Shouldn't happen */ | |
13842 break; | |
13843 case SSTATE_INTERNED_IMMORTAL: | |
13844 Py_REFCNT(s) += 1; | |
13845 immortal_size += PyUnicode_GET_LENGTH(s); | |
13846 break; | |
13847 case SSTATE_INTERNED_MORTAL: | |
13848 Py_REFCNT(s) += 2; | |
13849 mortal_size += PyUnicode_GET_LENGTH(s); | |
13850 break; | |
13851 default: | |
13852 Py_FatalError("Inconsistent interned string state."); | |
13853 } | |
13854 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; | |
13855 } | |
13856 fprintf(stderr, "total size of all interned strings: " | |
13857 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " | |
13858 "mortal/immortal\n", mortal_size, immortal_size); | |
13859 Py_DECREF(keys); | |
13860 PyDict_Clear(interned); | |
13861 Py_DECREF(interned); | |
13862 interned = NULL; | |
13863 } | |
13864 | |
13865 | |
13866 /********************* Unicode Iterator **************************/ | |
13867 | |
/* State of a str iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13873 | |
/* Deallocate a str iterator: untrack it from the garbage collector,
   release the string it refers to (if any) and free the object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
13881 | |
/* tp_traverse for the str iterator: the only object it references is
   it_seq. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
13888 | |
13889 static PyObject * | |
13890 unicodeiter_next(unicodeiterobject *it) | |
13891 { | |
13892 PyObject *seq, *item; | |
13893 | |
13894 assert(it != NULL); | |
13895 seq = it->it_seq; | |
13896 if (seq == NULL) | |
13897 return NULL; | |
13898 assert(_PyUnicode_CHECK(seq)); | |
13899 | |
13900 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { | |
13901 int kind = PyUnicode_KIND(seq); | |
13902 void *data = PyUnicode_DATA(seq); | |
13903 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); | |
13904 item = PyUnicode_FromOrdinal(chr); | |
13905 if (item != NULL) | |
13906 ++it->it_index; | |
13907 return item; | |
13908 } | |
13909 | |
13910 Py_DECREF(seq); | |
13911 it->it_seq = NULL; | |
13912 return NULL; | |
13913 } | |
13914 | |
13915 static PyObject * | |
13916 unicodeiter_len(unicodeiterobject *it) | |
13917 { | |
13918 Py_ssize_t len = 0; | |
13919 if (it->it_seq) | |
13920 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; | |
13921 return PyLong_FromSsize_t(len); | |
13922 } | |
13923 | |
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator: only __length_hint__ is provided. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13931 | |
/* Type object of the iterator returned by unicode_iter().  The slots are
   positional, so the order of the entries below must not change. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
13964 | |
13965 static PyObject * | |
13966 unicode_iter(PyObject *seq) | |
13967 { | |
13968 unicodeiterobject *it; | |
13969 | |
13970 if (!PyUnicode_Check(seq)) { | |
13971 PyErr_BadInternalCall(); | |
13972 return NULL; | |
13973 } | |
13974 if (PyUnicode_READY(seq) == -1) | |
13975 return NULL; | |
13976 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); | |
13977 if (it == NULL) | |
13978 return NULL; | |
13979 it->it_index = 0; | |
13980 Py_INCREF(seq); | |
13981 it->it_seq = seq; | |
13982 _PyObject_GC_TRACK(it); | |
13983 return (PyObject *)it; | |
13984 } | |
13985 | |
13986 | |
13987 size_t | |
13988 Py_UNICODE_strlen(const Py_UNICODE *u) | |
13989 { | |
13990 int res = 0; | |
13991 while(*u++) | |
13992 res++; | |
13993 return res; | |
13994 } | |
13995 | |
13996 Py_UNICODE* | |
13997 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) | |
13998 { | |
13999 Py_UNICODE *u = s1; | |
14000 while ((*u++ = *s2++)); | |
14001 return s1; | |
14002 } | |
14003 | |
14004 Py_UNICODE* | |
14005 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) | |
14006 { | |
14007 Py_UNICODE *u = s1; | |
14008 while ((*u++ = *s2++)) | |
14009 if (n-- == 0) | |
14010 break; | |
14011 return s1; | |
14012 } | |
14013 | |
14014 Py_UNICODE* | |
14015 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) | |
14016 { | |
14017 Py_UNICODE *u1 = s1; | |
14018 u1 += Py_UNICODE_strlen(u1); | |
14019 Py_UNICODE_strcpy(u1, s2); | |
14020 return s1; | |
14021 } | |
14022 | |
14023 int | |
14024 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) | |
14025 { | |
14026 while (*s1 && *s2 && *s1 == *s2) | |
14027 s1++, s2++; | |
14028 if (*s1 && *s2) | |
14029 return (*s1 < *s2) ? -1 : +1; | |
14030 if (*s1) | |
14031 return 1; | |
14032 if (*s2) | |
14033 return -1; | |
14034 return 0; | |
14035 } | |
14036 | |
14037 int | |
14038 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) | |
14039 { | |
14040 register Py_UNICODE u1, u2; | |
14041 for (; n != 0; n--) { | |
14042 u1 = *s1; | |
14043 u2 = *s2; | |
14044 if (u1 != u2) | |
14045 return (u1 < u2) ? -1 : +1; | |
14046 if (u1 == '\0') | |
14047 return 0; | |
14048 s1++; | |
14049 s2++; | |
14050 } | |
14051 return 0; | |
14052 } | |
14053 | |
14054 Py_UNICODE* | |
14055 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) | |
14056 { | |
14057 const Py_UNICODE *p; | |
14058 for (p = s; *p; p++) | |
14059 if (*p == c) | |
14060 return (Py_UNICODE*)p; | |
14061 return NULL; | |
14062 } | |
14063 | |
14064 Py_UNICODE* | |
14065 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) | |
14066 { | |
14067 const Py_UNICODE *p; | |
14068 p = s + Py_UNICODE_strlen(s); | |
14069 while (p != s) { | |
14070 p--; | |
14071 if (*p == c) | |
14072 return (Py_UNICODE*)p; | |
14073 } | |
14074 return NULL; | |
14075 } | |
14076 | |
14077 Py_UNICODE* | |
14078 PyUnicode_AsUnicodeCopy(PyObject *unicode) | |
14079 { | |
14080 Py_UNICODE *u, *copy; | |
14081 Py_ssize_t len, size; | |
14082 | |
14083 if (!PyUnicode_Check(unicode)) { | |
14084 PyErr_BadArgument(); | |
14085 return NULL; | |
14086 } | |
14087 u = PyUnicode_AsUnicodeAndSize(unicode, &len); | |
14088 if (u == NULL) | |
14089 return NULL; | |
14090 /* Ensure we won't overflow the size. */ | |
14091 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { | |
14092 PyErr_NoMemory(); | |
14093 return NULL; | |
14094 } | |
14095 size = len + 1; /* copy the null character */ | |
14096 size *= sizeof(Py_UNICODE); | |
14097 copy = PyMem_Malloc(size); | |
14098 if (copy == NULL) { | |
14099 PyErr_NoMemory(); | |
14100 return NULL; | |
14101 } | |
14102 memcpy(copy, u, size); | |
14103 return copy; | |
14104 } | |
14105 | |
14106 /* A _string module, to export formatter_parser and formatter_field_name_split | |
14107 to the string.Formatter class implemented in Python. */ | |
14108 | |
14109 static PyMethodDef _string_methods[] = { | |
14110 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, | |
14111 METH_O, PyDoc_STR("split the argument as a field name")}, | |
14112 {"formatter_parser", (PyCFunction) formatter_parser, | |
14113 METH_O, PyDoc_STR("parse the argument as a format string")}, | |
14114 {NULL, NULL} | |
14115 }; | |
14116 | |
14117 static struct PyModuleDef _string_module = { | |
14118 PyModuleDef_HEAD_INIT, | |
14119 "_string", | |
14120 PyDoc_STR("string helper module"), | |
14121 0, | |
14122 _string_methods, | |
14123 NULL, | |
14124 NULL, | |
14125 NULL, | |
14126 NULL | |
14127 }; | |
14128 | |
14129 PyMODINIT_FUNC | |
14130 PyInit__string(void) | |
14131 { | |
14132 return PyModule_Create(&_string_module); | |
14133 } | |
14134 | |
14135 |