comparison cos/python/Objects/unicodeobject.c @ 27:7f74363f4c82

Added some files for the python port
author windel
date Tue, 27 Dec 2011 18:59:02 +0100
parents
children
comparison
equal deleted inserted replaced
26:dcce92b1efbc 27:7f74363f4c82
1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "ucnhash.h"
44
45 /* Endianness switches; defaults to little endian */
46
47 #ifdef WORDS_BIGENDIAN
48 # define BYTEORDER_IS_BIG_ENDIAN
49 #else
50 # define BYTEORDER_IS_LITTLE_ENDIAN
51 #endif
52
53 /* --- Globals ------------------------------------------------------------
54
55 The globals are initialized by the _PyUnicode_Init() API and should
56 not be used before calling that API.
57
58 */
59
60
/* Largest code point defined by Unicode 6.0: U+10FFFF (1,114,111 decimal). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the internal macros below: debug builds run the
   consistency checker (without scanning the content), other builds only
   perform the plain type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
69
/* Field accessors.  The names starting with an underscore are raw lvalue
   accessors; the checked variants assert that the object passes
   _PyUnicode_CHECK() (and, where stated, that it is ready) first. */

/* utf8 pointer stored in the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII objects this is the
   memory directly following the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* utf8_length field stored in the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the length field for compact ASCII
   objects, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer (PyASCIIObject header) and its length (compact header). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash fields. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to the kind and the length. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
104
/* Local replacement for PyUnicode_READY(): evaluates to 0 when the object
   is already ready and to the result of _PyUnicode_Ready() otherwise.
   The argument is asserted to pass _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
          0 : \
          _PyUnicode_Ready(op)))
111
/* True if the utf8 pointer of the object aliases its character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object aliases its character data.
   Only the macro argument is referenced, so the macro works regardless of
   how the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
119
/* True if the object owns a separately allocated UTF-8 buffer: it is not
   a compact ASCII object, its utf8 pointer is set, and that pointer does
   not alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the object owns a separately allocated wstr buffer: its wstr
   pointer is set and either the object is not ready yet or the pointer
   does not alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
135
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every pointer argument is parenthesized before use, so arbitrary
   expressions (e.g. "base + offset") may be passed for begin, end and to.
   The main loop handles four characters per iteration; the trailing loop
   copies the remaining 0..3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
159
/* Invalidate the cached hash (set it to -1); to be used after the string
   has been modified. */
#define _PyUnicode_DIRTY(op) \
    do { \
        _PyUnicode_HASH(op) = -1; \
    } while (0)
162
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
171 static PyObject *interned;
172
173 /* The empty Unicode object is shared to improve performance. */
174 static PyObject *unicode_empty;
175
176 /* List of static strings. */
177 static _Py_Identifier *static_strings;
178
179 /* Single character Unicode strings in the Latin-1 range are being
180 shared as well. */
181 static PyObject *unicode_latin1[256];
182
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry i is 1 if the ASCII code i is whitespace, else 0.
   One row per 8 code points; the row's first code point is given on the
   right. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 */
    /* 0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 */
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
       0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 */
    /* 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 */
};
213
214 /* forward */
215 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
216 static PyObject* get_latin1_char(unsigned char ch);
217 static void copy_characters(
218 PyObject *to, Py_ssize_t to_start,
219 PyObject *from, Py_ssize_t from_start,
220 Py_ssize_t how_many);
221
222 static PyObject *
223 unicode_fromascii(const unsigned char *s, Py_ssize_t size);
224 static PyObject *
225 _PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
226 static PyObject *
227 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
228 static PyObject *
229 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
230
231 static PyObject *
232 unicode_encode_call_errorhandler(const char *errors,
233 PyObject **errorHandler,const char *encoding, const char *reason,
234 PyObject *unicode, PyObject **exceptionObject,
235 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
236
237 static void
238 raise_encode_exception(PyObject **exceptionObject,
239 const char *encoding,
240 PyObject *unicode,
241 Py_ssize_t startpos, Py_ssize_t endpos,
242 const char *reason);
243
/* Lookup table for fast detection of ASCII line break characters: entry i
   is 1 if the ASCII code i is a line break, else 0.  One row per 8 code
   points; the row's first code point is given on the right. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 */
    /* 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 */
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 */
};
271
272 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
273 This function is kept for backward compatibility with the old API. */
274 Py_UNICODE
275 PyUnicode_GetMax(void)
276 {
277 #ifdef Py_UNICODE_WIDE
278 return 0x10FFFF;
279 #else
280 /* This is actually an illegal character, so it should
281 not be passed to unichr. */
282 return 0xFFFF;
283 #endif
284 }
285
#ifdef Py_DEBUG
/* Debug-only invariant checker for unicode objects.

   op must be a unicode object.  All checks are expressed as assert()s, so
   a violated invariant aborts; when every check passes the function
   returns 1, which lets callers write
   assert(_PyUnicode_CheckConsistency(op, ...)).

   If check_content is non-zero (and the string is not in the legacy wstr
   representation), the characters are additionally scanned to verify that
   the narrowest possible kind is used for the stored data. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header is present */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: data directly follows the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the character width of
               the kind equals the width of wchar_t -- the same rule
               PyUnicode_New() applies when it sets up the sharing. */
            if ((kind == PyUnicode_2BYTE_KIND && sizeof(wchar_t) == 2)
                || (kind == PyUnicode_4BYTE_KIND && sizeof(wchar_t) == 4))
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
393
394 static PyObject*
395 unicode_result_wchar(PyObject *unicode)
396 {
397 #ifndef Py_DEBUG
398 Py_ssize_t len;
399
400 assert(Py_REFCNT(unicode) == 1);
401
402 len = _PyUnicode_WSTR_LENGTH(unicode);
403 if (len == 0) {
404 Py_INCREF(unicode_empty);
405 Py_DECREF(unicode);
406 return unicode_empty;
407 }
408
409 if (len == 1) {
410 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
411 if (ch < 256) {
412 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
413 Py_DECREF(unicode);
414 return latin1_char;
415 }
416 }
417
418 if (_PyUnicode_Ready(unicode) < 0) {
419 Py_XDECREF(unicode);
420 return NULL;
421 }
422 #else
423 /* don't make the result ready in debug mode to ensure that the caller
424 makes the string ready before using it */
425 assert(_PyUnicode_CheckConsistency(unicode, 1));
426 #endif
427 return unicode;
428 }
429
430 static PyObject*
431 unicode_result_ready(PyObject *unicode)
432 {
433 Py_ssize_t length;
434
435 length = PyUnicode_GET_LENGTH(unicode);
436 if (length == 0) {
437 if (unicode != unicode_empty) {
438 Py_INCREF(unicode_empty);
439 Py_DECREF(unicode);
440 }
441 return unicode_empty;
442 }
443
444 if (length == 1) {
445 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
446 if (ch < 256) {
447 PyObject *latin1_char = unicode_latin1[ch];
448 if (latin1_char != NULL) {
449 if (unicode != latin1_char) {
450 Py_INCREF(latin1_char);
451 Py_DECREF(unicode);
452 }
453 return latin1_char;
454 }
455 else {
456 assert(_PyUnicode_CheckConsistency(unicode, 1));
457 Py_INCREF(unicode);
458 unicode_latin1[ch] = unicode;
459 return unicode;
460 }
461 }
462 }
463
464 assert(_PyUnicode_CheckConsistency(unicode, 1));
465 return unicode;
466 }
467
468 static PyObject*
469 unicode_result(PyObject *unicode)
470 {
471 assert(_PyUnicode_CHECK(unicode));
472 if (PyUnicode_IS_READY(unicode))
473 return unicode_result_ready(unicode);
474 else
475 return unicode_result_wchar(unicode);
476 }
477
478 #ifdef HAVE_MBCS
479 static OSVERSIONINFOEX winver;
480 #endif
481
482 /* --- Bloom Filters ----------------------------------------------------- */
483
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the low-order bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) select the bit index. */
487
488 /* the linebreak mask is set up by Unicode_Init below */
489
490 #if LONG_BIT >= 128
491 #define BLOOM_WIDTH 128
492 #elif LONG_BIT >= 64
493 #define BLOOM_WIDTH 64
494 #elif LONG_BIT >= 32
495 #define BLOOM_WIDTH 32
496 #else
497 #error "LONG_BIT is smaller than 32"
498 #endif
499
/* Type holding one bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* Set, respectively test, the mask bit selected by the low-order bits of
   ch (ch & (BLOOM_WIDTH - 1)).  BLOOM() evaluates to non-zero if the bit
   is set. */
#define BLOOM_ADD(mask, ch) \
    ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch) \
    ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up in the
   ascii_linebreak table; for all others the bloom mask is consulted first
   and Py_UNICODE_ISLINEBREAK() is only evaluated when the mask bit is
   set. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
510
511 Py_LOCAL_INLINE(BLOOM_MASK)
512 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
513 {
514 /* calculate simple bloom-style bitmask for a given unicode string */
515
516 BLOOM_MASK mask;
517 Py_ssize_t i;
518
519 mask = 0;
520 for (i = 0; i < len; i++)
521 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
522
523 return mask;
524 }
525
/* Membership test of chr in str: the bloom mask is checked first, and
   PyUnicode_FindChar() over the whole string is only evaluated when the
   mask bit for chr is set. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
529
530 /* Compilation of templated routines */
531
532 #include "stringlib/asciilib.h"
533 #include "stringlib/fastsearch.h"
534 #include "stringlib/partition.h"
535 #include "stringlib/split.h"
536 #include "stringlib/count.h"
537 #include "stringlib/find.h"
538 #include "stringlib/find_max_char.h"
539 #include "stringlib/localeutil.h"
540 #include "stringlib/undef.h"
541
542 #include "stringlib/ucs1lib.h"
543 #include "stringlib/fastsearch.h"
544 #include "stringlib/partition.h"
545 #include "stringlib/split.h"
546 #include "stringlib/count.h"
547 #include "stringlib/find.h"
548 #include "stringlib/find_max_char.h"
549 #include "stringlib/localeutil.h"
550 #include "stringlib/undef.h"
551
552 #include "stringlib/ucs2lib.h"
553 #include "stringlib/fastsearch.h"
554 #include "stringlib/partition.h"
555 #include "stringlib/split.h"
556 #include "stringlib/count.h"
557 #include "stringlib/find.h"
558 #include "stringlib/find_max_char.h"
559 #include "stringlib/localeutil.h"
560 #include "stringlib/undef.h"
561
562 #include "stringlib/ucs4lib.h"
563 #include "stringlib/fastsearch.h"
564 #include "stringlib/partition.h"
565 #include "stringlib/split.h"
566 #include "stringlib/count.h"
567 #include "stringlib/find.h"
568 #include "stringlib/find_max_char.h"
569 #include "stringlib/localeutil.h"
570 #include "stringlib/undef.h"
571
572 #include "stringlib/unicodedefs.h"
573 #include "stringlib/fastsearch.h"
574 #include "stringlib/count.h"
575 #include "stringlib/find.h"
576 #include "stringlib/undef.h"
577
578 /* --- Unicode Object ----------------------------------------------------- */
579
580 static PyObject *
581 fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
582
583 Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
584 Py_ssize_t size, Py_UCS4 ch,
585 int direction)
586 {
587 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
588
589 switch (kind) {
590 case PyUnicode_1BYTE_KIND:
591 {
592 Py_UCS1 ch1 = (Py_UCS1) ch;
593 if (ch1 == ch)
594 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
595 else
596 return -1;
597 }
598 case PyUnicode_2BYTE_KIND:
599 {
600 Py_UCS2 ch2 = (Py_UCS2) ch;
601 if (ch2 == ch)
602 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
603 else
604 return -1;
605 }
606 case PyUnicode_4BYTE_KIND:
607 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
608 default:
609 assert(0);
610 return -1;
611 }
612 }
613
614 static PyObject*
615 resize_compact(PyObject *unicode, Py_ssize_t length)
616 {
617 Py_ssize_t char_size;
618 Py_ssize_t struct_size;
619 Py_ssize_t new_size;
620 int share_wstr;
621
622 assert(PyUnicode_IS_READY(unicode));
623 char_size = PyUnicode_KIND(unicode);
624 if (PyUnicode_IS_COMPACT_ASCII(unicode))
625 struct_size = sizeof(PyASCIIObject);
626 else
627 struct_size = sizeof(PyCompactUnicodeObject);
628 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
629
630 _Py_DEC_REFTOTAL;
631 _Py_ForgetReference(unicode);
632
633 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
634 PyErr_NoMemory();
635 return NULL;
636 }
637 new_size = (struct_size + (length + 1) * char_size);
638
639 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
640 if (unicode == NULL) {
641 PyObject_Del(unicode);
642 PyErr_NoMemory();
643 return NULL;
644 }
645 _Py_NewReference(unicode);
646 _PyUnicode_LENGTH(unicode) = length;
647 if (share_wstr) {
648 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
649 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
650 _PyUnicode_WSTR_LENGTH(unicode) = length;
651 }
652 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
653 length, 0);
654 return unicode;
655 }
656
657 static int
658 resize_inplace(PyObject *unicode, Py_ssize_t length)
659 {
660 wchar_t *wstr;
661 assert(!PyUnicode_IS_COMPACT(unicode));
662 assert(Py_REFCNT(unicode) == 1);
663
664 _PyUnicode_DIRTY(unicode);
665
666 if (PyUnicode_IS_READY(unicode)) {
667 Py_ssize_t char_size;
668 Py_ssize_t new_size;
669 int share_wstr, share_utf8;
670 void *data;
671
672 data = _PyUnicode_DATA_ANY(unicode);
673 assert(data != NULL);
674 char_size = PyUnicode_KIND(unicode);
675 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
676 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
677 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
678 {
679 PyObject_DEL(_PyUnicode_UTF8(unicode));
680 _PyUnicode_UTF8(unicode) = NULL;
681 _PyUnicode_UTF8_LENGTH(unicode) = 0;
682 }
683
684 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
685 PyErr_NoMemory();
686 return -1;
687 }
688 new_size = (length + 1) * char_size;
689
690 data = (PyObject *)PyObject_REALLOC(data, new_size);
691 if (data == NULL) {
692 PyErr_NoMemory();
693 return -1;
694 }
695 _PyUnicode_DATA_ANY(unicode) = data;
696 if (share_wstr) {
697 _PyUnicode_WSTR(unicode) = data;
698 _PyUnicode_WSTR_LENGTH(unicode) = length;
699 }
700 if (share_utf8) {
701 _PyUnicode_UTF8(unicode) = data;
702 _PyUnicode_UTF8_LENGTH(unicode) = length;
703 }
704 _PyUnicode_LENGTH(unicode) = length;
705 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
706 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
707 assert(_PyUnicode_CheckConsistency(unicode, 0));
708 return 0;
709 }
710 }
711 assert(_PyUnicode_WSTR(unicode) != NULL);
712
713 /* check for integer overflow */
714 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
715 PyErr_NoMemory();
716 return -1;
717 }
718 wstr = _PyUnicode_WSTR(unicode);
719 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
720 if (!wstr) {
721 PyErr_NoMemory();
722 return -1;
723 }
724 _PyUnicode_WSTR(unicode) = wstr;
725 _PyUnicode_WSTR(unicode)[length] = 0;
726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 assert(_PyUnicode_CheckConsistency(unicode, 0));
728 return 0;
729 }
730
731 static PyObject*
732 resize_copy(PyObject *unicode, Py_ssize_t length)
733 {
734 Py_ssize_t copy_length;
735 if (PyUnicode_IS_COMPACT(unicode)) {
736 PyObject *copy;
737 assert(PyUnicode_IS_READY(unicode));
738
739 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
740 if (copy == NULL)
741 return NULL;
742
743 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
744 copy_characters(copy, 0, unicode, 0, copy_length);
745 return copy;
746 }
747 else {
748 PyObject *w;
749 assert(_PyUnicode_WSTR(unicode) != NULL);
750 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
751 w = (PyObject*)_PyUnicode_New(length);
752 if (w == NULL)
753 return NULL;
754 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
755 copy_length = Py_MIN(copy_length, length);
756 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
757 copy_length);
758 return w;
759 }
760 }
761
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
770
771 #ifdef Py_DEBUG
772 static int unicode_old_new_calls = 0;
773 #endif
774
775 static PyUnicodeObject *
776 _PyUnicode_New(Py_ssize_t length)
777 {
778 register PyUnicodeObject *unicode;
779 size_t new_size;
780
781 /* Optimization for empty strings */
782 if (length == 0 && unicode_empty != NULL) {
783 Py_INCREF(unicode_empty);
784 return (PyUnicodeObject*)unicode_empty;
785 }
786
787 /* Ensure we won't overflow the size. */
788 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
789 return (PyUnicodeObject *)PyErr_NoMemory();
790 }
791 if (length < 0) {
792 PyErr_SetString(PyExc_SystemError,
793 "Negative size passed to _PyUnicode_New");
794 return NULL;
795 }
796
797 #ifdef Py_DEBUG
798 ++unicode_old_new_calls;
799 #endif
800
801 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
802 if (unicode == NULL)
803 return NULL;
804 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
805 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
806 if (!_PyUnicode_WSTR(unicode)) {
807 PyErr_NoMemory();
808 goto onError;
809 }
810
811 /* Initialize the first element to guard against cases where
812 * the caller fails before initializing str -- unicode_resize()
813 * reads str[0], and the Keep-Alive optimization can keep memory
814 * allocated for str alive across a call to unicode_dealloc(unicode).
815 * We don't want unicode_resize to read uninitialized memory in
816 * that case.
817 */
818 _PyUnicode_WSTR(unicode)[0] = 0;
819 _PyUnicode_WSTR(unicode)[length] = 0;
820 _PyUnicode_WSTR_LENGTH(unicode) = length;
821 _PyUnicode_HASH(unicode) = -1;
822 _PyUnicode_STATE(unicode).interned = 0;
823 _PyUnicode_STATE(unicode).kind = 0;
824 _PyUnicode_STATE(unicode).compact = 0;
825 _PyUnicode_STATE(unicode).ready = 0;
826 _PyUnicode_STATE(unicode).ascii = 0;
827 _PyUnicode_DATA_ANY(unicode) = NULL;
828 _PyUnicode_LENGTH(unicode) = 0;
829 _PyUnicode_UTF8(unicode) = NULL;
830 _PyUnicode_UTF8_LENGTH(unicode) = 0;
831 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
832 return unicode;
833
834 onError:
835 /* XXX UNREF/NEWREF interface should be more symmetrical */
836 _Py_DEC_REFTOTAL;
837 _Py_ForgetReference((PyObject *)unicode);
838 PyObject_Del(unicode);
839 return NULL;
840 }
841
842 static const char*
843 unicode_kind_name(PyObject *unicode)
844 {
845 /* don't check consistency: unicode_kind_name() is called from
846 _PyUnicode_Dump() */
847 if (!PyUnicode_IS_COMPACT(unicode))
848 {
849 if (!PyUnicode_IS_READY(unicode))
850 return "wstr";
851 switch(PyUnicode_KIND(unicode))
852 {
853 case PyUnicode_1BYTE_KIND:
854 if (PyUnicode_IS_ASCII(unicode))
855 return "legacy ascii";
856 else
857 return "legacy latin1";
858 case PyUnicode_2BYTE_KIND:
859 return "legacy UCS2";
860 case PyUnicode_4BYTE_KIND:
861 return "legacy UCS4";
862 default:
863 return "<legacy invalid kind>";
864 }
865 }
866 assert(PyUnicode_IS_READY(unicode));
867 switch(PyUnicode_KIND(unicode))
868 {
869 case PyUnicode_1BYTE_KIND:
870 if (PyUnicode_IS_ASCII(unicode))
871 return "ascii";
872 else
873 return "latin1";
874 case PyUnicode_2BYTE_KIND:
875 return "UCS2";
876 case PyUnicode_4BYTE_KIND:
877 return "UCS4";
878 default:
879 return "<invalid compact kind>";
880 }
881 }
882
883 #ifdef Py_DEBUG
884 static int unicode_new_new_calls = 0;
885
886 /* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
890
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
894 void *_PyUnicode_data(void *unicode){
895 printf("obj %p\n", unicode);
896 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
897 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
898 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
899 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
900 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
901 return PyUnicode_DATA(unicode);
902 }
903
904 void
905 _PyUnicode_Dump(PyObject *op)
906 {
907 PyASCIIObject *ascii = (PyASCIIObject *)op;
908 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
909 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
910 void *data;
911
912 if (ascii->state.compact)
913 {
914 if (ascii->state.ascii)
915 data = (ascii + 1);
916 else
917 data = (compact + 1);
918 }
919 else
920 data = unicode->data.any;
921 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
922
923 if (ascii->wstr == data)
924 printf("shared ");
925 printf("wstr=%p", ascii->wstr);
926
927 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
928 printf(" (%zu), ", compact->wstr_length);
929 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
930 printf("shared ");
931 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
932 }
933 printf(", data=%p\n", data);
934 }
935 #endif
936
937 PyObject *
938 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
939 {
940 PyObject *obj;
941 PyCompactUnicodeObject *unicode;
942 void *data;
943 int kind_state;
944 int is_sharing, is_ascii;
945 Py_ssize_t char_size;
946 Py_ssize_t struct_size;
947
948 /* Optimization for empty strings */
949 if (size == 0 && unicode_empty != NULL) {
950 Py_INCREF(unicode_empty);
951 return unicode_empty;
952 }
953
954 #ifdef Py_DEBUG
955 ++unicode_new_new_calls;
956 #endif
957
958 is_ascii = 0;
959 is_sharing = 0;
960 struct_size = sizeof(PyCompactUnicodeObject);
961 if (maxchar < 128) {
962 kind_state = PyUnicode_1BYTE_KIND;
963 char_size = 1;
964 is_ascii = 1;
965 struct_size = sizeof(PyASCIIObject);
966 }
967 else if (maxchar < 256) {
968 kind_state = PyUnicode_1BYTE_KIND;
969 char_size = 1;
970 }
971 else if (maxchar < 65536) {
972 kind_state = PyUnicode_2BYTE_KIND;
973 char_size = 2;
974 if (sizeof(wchar_t) == 2)
975 is_sharing = 1;
976 }
977 else {
978 kind_state = PyUnicode_4BYTE_KIND;
979 char_size = 4;
980 if (sizeof(wchar_t) == 4)
981 is_sharing = 1;
982 }
983
984 /* Ensure we won't overflow the size. */
985 if (size < 0) {
986 PyErr_SetString(PyExc_SystemError,
987 "Negative size passed to PyUnicode_New");
988 return NULL;
989 }
990 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
991 return PyErr_NoMemory();
992
993 /* Duplicated allocation code from _PyObject_New() instead of a call to
994 * PyObject_New() so we are able to allocate space for the object and
995 * it's data buffer.
996 */
997 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
998 if (obj == NULL)
999 return PyErr_NoMemory();
1000 obj = PyObject_INIT(obj, &PyUnicode_Type);
1001 if (obj == NULL)
1002 return NULL;
1003
1004 unicode = (PyCompactUnicodeObject *)obj;
1005 if (is_ascii)
1006 data = ((PyASCIIObject*)obj) + 1;
1007 else
1008 data = unicode + 1;
1009 _PyUnicode_LENGTH(unicode) = size;
1010 _PyUnicode_HASH(unicode) = -1;
1011 _PyUnicode_STATE(unicode).interned = 0;
1012 _PyUnicode_STATE(unicode).kind = kind_state;
1013 _PyUnicode_STATE(unicode).compact = 1;
1014 _PyUnicode_STATE(unicode).ready = 1;
1015 _PyUnicode_STATE(unicode).ascii = is_ascii;
1016 if (is_ascii) {
1017 ((char*)data)[size] = 0;
1018 _PyUnicode_WSTR(unicode) = NULL;
1019 }
1020 else if (kind_state == PyUnicode_1BYTE_KIND) {
1021 ((char*)data)[size] = 0;
1022 _PyUnicode_WSTR(unicode) = NULL;
1023 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1024 unicode->utf8 = NULL;
1025 unicode->utf8_length = 0;
1026 }
1027 else {
1028 unicode->utf8 = NULL;
1029 unicode->utf8_length = 0;
1030 if (kind_state == PyUnicode_2BYTE_KIND)
1031 ((Py_UCS2*)data)[size] = 0;
1032 else /* kind_state == PyUnicode_4BYTE_KIND */
1033 ((Py_UCS4*)data)[size] = 0;
1034 if (is_sharing) {
1035 _PyUnicode_WSTR_LENGTH(unicode) = size;
1036 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1037 }
1038 else {
1039 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1040 _PyUnicode_WSTR(unicode) = NULL;
1041 }
1042 }
1043 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1044 return obj;
1045 }
1046
1047 static int
1048 _PyUnicode_Dirty(PyObject *unicode)
1049 {
1050 assert(_PyUnicode_CHECK(unicode));
1051 if (Py_REFCNT(unicode) != 1) {
1052 PyErr_SetString(PyExc_SystemError,
1053 "Cannot modify a string having more than 1 reference");
1054 return -1;
1055 }
1056 _PyUnicode_DIRTY(unicode);
1057 return 0;
1058 }
1059
1060 static int
1061 _copy_characters(PyObject *to, Py_ssize_t to_start,
1062 PyObject *from, Py_ssize_t from_start,
1063 Py_ssize_t how_many, int check_maxchar)
1064 {
1065 unsigned int from_kind, to_kind;
1066 void *from_data, *to_data;
1067 int fast;
1068
1069 assert(PyUnicode_Check(from));
1070 assert(PyUnicode_Check(to));
1071 assert(PyUnicode_IS_READY(from));
1072 assert(PyUnicode_IS_READY(to));
1073
1074 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1075 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1076 assert(0 <= how_many);
1077
1078 if (how_many == 0)
1079 return 0;
1080
1081 from_kind = PyUnicode_KIND(from);
1082 from_data = PyUnicode_DATA(from);
1083 to_kind = PyUnicode_KIND(to);
1084 to_data = PyUnicode_DATA(to);
1085
1086 #ifdef Py_DEBUG
1087 if (!check_maxchar
1088 && (from_kind > to_kind
1089 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
1090 {
1091 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1092 Py_UCS4 ch;
1093 Py_ssize_t i;
1094 for (i=0; i < how_many; i++) {
1095 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1096 assert(ch <= to_maxchar);
1097 }
1098 }
1099 #endif
1100 fast = (from_kind == to_kind);
1101 if (check_maxchar
1102 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1103 {
1104 /* deny latin1 => ascii */
1105 fast = 0;
1106 }
1107
1108 if (fast) {
1109 Py_MEMCPY((char*)to_data + to_kind * to_start,
1110 (char*)from_data + from_kind * from_start,
1111 to_kind * how_many);
1112 }
1113 else if (from_kind == PyUnicode_1BYTE_KIND
1114 && to_kind == PyUnicode_2BYTE_KIND)
1115 {
1116 _PyUnicode_CONVERT_BYTES(
1117 Py_UCS1, Py_UCS2,
1118 PyUnicode_1BYTE_DATA(from) + from_start,
1119 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1120 PyUnicode_2BYTE_DATA(to) + to_start
1121 );
1122 }
1123 else if (from_kind == PyUnicode_1BYTE_KIND
1124 && to_kind == PyUnicode_4BYTE_KIND)
1125 {
1126 _PyUnicode_CONVERT_BYTES(
1127 Py_UCS1, Py_UCS4,
1128 PyUnicode_1BYTE_DATA(from) + from_start,
1129 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1130 PyUnicode_4BYTE_DATA(to) + to_start
1131 );
1132 }
1133 else if (from_kind == PyUnicode_2BYTE_KIND
1134 && to_kind == PyUnicode_4BYTE_KIND)
1135 {
1136 _PyUnicode_CONVERT_BYTES(
1137 Py_UCS2, Py_UCS4,
1138 PyUnicode_2BYTE_DATA(from) + from_start,
1139 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1140 PyUnicode_4BYTE_DATA(to) + to_start
1141 );
1142 }
1143 else {
1144 /* check if max_char(from substring) <= max_char(to) */
1145 if (from_kind > to_kind
1146 /* latin1 => ascii */
1147 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1148 {
1149 /* slow path to check for character overflow */
1150 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1151 Py_UCS4 ch;
1152 Py_ssize_t i;
1153
1154 #ifdef Py_DEBUG
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1159 }
1160 #else
1161 if (!check_maxchar) {
1162 for (i=0; i < how_many; i++) {
1163 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1164 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1165 }
1166 }
1167 else {
1168 for (i=0; i < how_many; i++) {
1169 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1170 if (ch > to_maxchar)
1171 return 1;
1172 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1173 }
1174 }
1175 #endif
1176 }
1177 else {
1178 assert(0 && "inconsistent state");
1179 return 1;
1180 }
1181 }
1182 return 0;
1183 }
1184
1185 static void
1186 copy_characters(PyObject *to, Py_ssize_t to_start,
1187 PyObject *from, Py_ssize_t from_start,
1188 Py_ssize_t how_many)
1189 {
1190 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1191 }
1192
1193 Py_ssize_t
1194 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1195 PyObject *from, Py_ssize_t from_start,
1196 Py_ssize_t how_many)
1197 {
1198 int err;
1199
1200 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1201 PyErr_BadInternalCall();
1202 return -1;
1203 }
1204
1205 if (PyUnicode_READY(from))
1206 return -1;
1207 if (PyUnicode_READY(to))
1208 return -1;
1209
1210 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1211 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1212 PyErr_Format(PyExc_SystemError,
1213 "Cannot write %zi characters at %zi "
1214 "in a string of %zi characters",
1215 how_many, to_start, PyUnicode_GET_LENGTH(to));
1216 return -1;
1217 }
1218
1219 if (how_many == 0)
1220 return 0;
1221
1222 if (_PyUnicode_Dirty(to))
1223 return -1;
1224
1225 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1226 if (err) {
1227 PyErr_Format(PyExc_SystemError,
1228 "Cannot copy %s characters "
1229 "into a string of %s characters",
1230 unicode_kind_name(from),
1231 unicode_kind_name(to));
1232 return -1;
1233 }
1234 return how_many;
1235 }
1236
1237 /* Find the maximum code point and count the number of surrogate pairs so a
1238 correct string length can be computed before converting a string to UCS4.
1239 This function counts single surrogates as a character and not as a pair.
1240
1241 Return 0 on success, or -1 on error. */
1242 static int
1243 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1244 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1245 {
1246 const wchar_t *iter;
1247 Py_UCS4 ch;
1248
1249 assert(num_surrogates != NULL && maxchar != NULL);
1250 *num_surrogates = 0;
1251 *maxchar = 0;
1252
1253 for (iter = begin; iter < end; ) {
1254 {
1255 ch = *iter;
1256 iter++;
1257 }
1258 if (ch > *maxchar) {
1259 *maxchar = ch;
1260 if (*maxchar > MAX_UNICODE) {
1261 PyErr_Format(PyExc_ValueError,
1262 "character U+%x is not in range [U+0000; U+10ffff]",
1263 ch);
1264 return -1;
1265 }
1266 }
1267 }
1268 return 0;
1269 }
1270
1271 int
1272 _PyUnicode_Ready(PyObject *unicode)
1273 {
1274 wchar_t *end;
1275 Py_UCS4 maxchar = 0;
1276 Py_ssize_t num_surrogates;
1277
1278 /* _PyUnicode_Ready() is only intended for old-style API usage where
1279 strings were created using _PyObject_New() and where no canonical
1280 representation (the str field) has been set yet aka strings
1281 which are not yet ready. */
1282 assert(_PyUnicode_CHECK(unicode));
1283 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1284 assert(_PyUnicode_WSTR(unicode) != NULL);
1285 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1286 assert(_PyUnicode_UTF8(unicode) == NULL);
1287 /* Actually, it should neither be interned nor be anything else: */
1288 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1289
1290 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1291 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1292 &maxchar, &num_surrogates) == -1)
1293 return -1;
1294
1295 if (maxchar < 256) {
1296 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1297 if (!_PyUnicode_DATA_ANY(unicode)) {
1298 PyErr_NoMemory();
1299 return -1;
1300 }
1301 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1302 _PyUnicode_WSTR(unicode), end,
1303 PyUnicode_1BYTE_DATA(unicode));
1304 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1305 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1306 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1307 if (maxchar < 128) {
1308 _PyUnicode_STATE(unicode).ascii = 1;
1309 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1310 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1311 }
1312 else {
1313 _PyUnicode_STATE(unicode).ascii = 0;
1314 _PyUnicode_UTF8(unicode) = NULL;
1315 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1316 }
1317 PyObject_FREE(_PyUnicode_WSTR(unicode));
1318 _PyUnicode_WSTR(unicode) = NULL;
1319 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1320 }
1321 /* In this case we might have to convert down from 4-byte native
1322 wchar_t to 2-byte unicode. */
1323 else if (maxchar < 65536) {
1324 assert(num_surrogates == 0 &&
1325 "FindMaxCharAndNumSurrogatePairs() messed up");
1326
1327 /* sizeof(wchar_t) == 4 */
1328 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1329 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1330 if (!_PyUnicode_DATA_ANY(unicode)) {
1331 PyErr_NoMemory();
1332 return -1;
1333 }
1334 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1335 _PyUnicode_WSTR(unicode), end,
1336 PyUnicode_2BYTE_DATA(unicode));
1337 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1338 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1339 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1340 _PyUnicode_UTF8(unicode) = NULL;
1341 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1342 PyObject_FREE(_PyUnicode_WSTR(unicode));
1343 _PyUnicode_WSTR(unicode) = NULL;
1344 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1345 }
1346 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1347 else {
1348 assert(num_surrogates == 0);
1349
1350 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1351 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1352 _PyUnicode_UTF8(unicode) = NULL;
1353 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1354 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1355 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1356 }
1357 _PyUnicode_STATE(unicode).ready = 1;
1358 assert(_PyUnicode_CheckConsistency(unicode, 1));
1359 return 0;
1360 }
1361
1362 static void
1363 unicode_dealloc(register PyObject *unicode)
1364 {
1365 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1366 case SSTATE_NOT_INTERNED:
1367 break;
1368
1369 case SSTATE_INTERNED_MORTAL:
1370 /* revive dead object temporarily for DelItem */
1371 Py_REFCNT(unicode) = 3;
1372 if (PyDict_DelItem(interned, unicode) != 0)
1373 Py_FatalError(
1374 "deletion of interned string failed");
1375 break;
1376
1377 case SSTATE_INTERNED_IMMORTAL:
1378 Py_FatalError("Immortal interned string died.");
1379
1380 default:
1381 Py_FatalError("Inconsistent interned string state.");
1382 }
1383
1384 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1385 PyObject_DEL(_PyUnicode_WSTR(unicode));
1386 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1387 PyObject_DEL(_PyUnicode_UTF8(unicode));
1388
1389 if (PyUnicode_IS_COMPACT(unicode)) {
1390 Py_TYPE(unicode)->tp_free(unicode);
1391 }
1392 else {
1393 if (_PyUnicode_DATA_ANY(unicode))
1394 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1395 Py_TYPE(unicode)->tp_free(unicode);
1396 }
1397 }
1398
1399 #ifdef Py_DEBUG
1400 static int
1401 unicode_is_singleton(PyObject *unicode)
1402 {
1403 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1404 if (unicode == unicode_empty)
1405 return 1;
1406 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1407 {
1408 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1409 if (ch < 256 && unicode_latin1[ch] == unicode)
1410 return 1;
1411 }
1412 return 0;
1413 }
1414 #endif
1415
1416 static int
1417 unicode_resizable(PyObject *unicode)
1418 {
1419 if (Py_REFCNT(unicode) != 1)
1420 return 0;
1421 if (PyUnicode_CHECK_INTERNED(unicode))
1422 return 0;
1423 #ifdef Py_DEBUG
1424 /* singleton refcount is greater than 1 */
1425 assert(!unicode_is_singleton(unicode));
1426 #endif
1427 return 1;
1428 }
1429
1430 static int
1431 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1432 {
1433 PyObject *unicode;
1434 Py_ssize_t old_length;
1435
1436 assert(p_unicode != NULL);
1437 unicode = *p_unicode;
1438
1439 assert(unicode != NULL);
1440 assert(PyUnicode_Check(unicode));
1441 assert(0 <= length);
1442
1443 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1444 old_length = PyUnicode_WSTR_LENGTH(unicode);
1445 else
1446 old_length = PyUnicode_GET_LENGTH(unicode);
1447 if (old_length == length)
1448 return 0;
1449
1450 if (length == 0) {
1451 Py_DECREF(*p_unicode);
1452 *p_unicode = unicode_empty;
1453 Py_INCREF(*p_unicode);
1454 return 0;
1455 }
1456
1457 if (!unicode_resizable(unicode)) {
1458 PyObject *copy = resize_copy(unicode, length);
1459 if (copy == NULL)
1460 return -1;
1461 Py_DECREF(*p_unicode);
1462 *p_unicode = copy;
1463 return 0;
1464 }
1465
1466 if (PyUnicode_IS_COMPACT(unicode)) {
1467 *p_unicode = resize_compact(unicode, length);
1468 if (*p_unicode == NULL)
1469 return -1;
1470 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
1471 return 0;
1472 }
1473 return resize_inplace(unicode, length);
1474 }
1475
1476 int
1477 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1478 {
1479 PyObject *unicode;
1480 if (p_unicode == NULL) {
1481 PyErr_BadInternalCall();
1482 return -1;
1483 }
1484 unicode = *p_unicode;
1485 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1486 {
1487 PyErr_BadInternalCall();
1488 return -1;
1489 }
1490 return unicode_resize(p_unicode, length);
1491 }
1492
1493 static int
1494 unicode_widen(PyObject **p_unicode, unsigned int maxchar)
1495 {
1496 PyObject *result;
1497 assert(PyUnicode_IS_READY(*p_unicode));
1498 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1499 return 0;
1500 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1501 maxchar);
1502 if (result == NULL)
1503 return -1;
1504 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1505 PyUnicode_GET_LENGTH(*p_unicode));
1506 Py_DECREF(*p_unicode);
1507 *p_unicode = result;
1508 return 0;
1509 }
1510
1511 static int
1512 unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1513 Py_UCS4 ch)
1514 {
1515 if (unicode_widen(p_unicode, ch) < 0)
1516 return -1;
1517 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1518 PyUnicode_DATA(*p_unicode),
1519 (*pos)++, ch);
1520 return 0;
1521 }
1522
1523 static PyObject*
1524 get_latin1_char(unsigned char ch)
1525 {
1526 PyObject *unicode = unicode_latin1[ch];
1527 if (!unicode) {
1528 unicode = PyUnicode_New(1, ch);
1529 if (!unicode)
1530 return NULL;
1531 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1532 assert(_PyUnicode_CheckConsistency(unicode, 1));
1533 unicode_latin1[ch] = unicode;
1534 }
1535 Py_INCREF(unicode);
1536 return unicode;
1537 }
1538
1539 PyObject *
1540 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1541 {
1542 PyObject *unicode;
1543 Py_UCS4 maxchar = 0;
1544 Py_ssize_t num_surrogates;
1545
1546 if (u == NULL)
1547 return (PyObject*)_PyUnicode_New(size);
1548
1549 /* If the Unicode data is known at construction time, we can apply
1550 some optimizations which share commonly used objects. */
1551
1552 /* Optimization for empty strings */
1553 if (size == 0 && unicode_empty != NULL) {
1554 Py_INCREF(unicode_empty);
1555 return unicode_empty;
1556 }
1557
1558 /* Single character Unicode objects in the Latin-1 range are
1559 shared when using this constructor */
1560 if (size == 1 && *u < 256)
1561 return get_latin1_char((unsigned char)*u);
1562
1563 /* If not empty and not single character, copy the Unicode data
1564 into the new object */
1565 if (find_maxchar_surrogates(u, u + size,
1566 &maxchar, &num_surrogates) == -1)
1567 return NULL;
1568
1569 unicode = PyUnicode_New(size - num_surrogates, maxchar);
1570 if (!unicode)
1571 return NULL;
1572
1573 switch (PyUnicode_KIND(unicode)) {
1574 case PyUnicode_1BYTE_KIND:
1575 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1576 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1577 break;
1578 case PyUnicode_2BYTE_KIND:
1579 #if Py_UNICODE_SIZE == 2
1580 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1581 #else
1582 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1583 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1584 #endif
1585 break;
1586 case PyUnicode_4BYTE_KIND:
1587 #if SIZEOF_WCHAR_T == 2
1588 /* This is the only case which has to process surrogates, thus
1589 a simple copy loop is not enough and we need a function. */
1590 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1591 #else
1592 assert(num_surrogates == 0);
1593 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1594 #endif
1595 break;
1596 default:
1597 assert(0 && "Impossible state");
1598 }
1599
1600 return unicode_result(unicode);
1601 }
1602
1603 PyObject *
1604 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1605 {
1606 if (size < 0) {
1607 PyErr_SetString(PyExc_SystemError,
1608 "Negative size passed to PyUnicode_FromStringAndSize");
1609 return NULL;
1610 }
1611
1612 /* If the Unicode data is known at construction time, we can apply
1613 some optimizations which share commonly used objects.
1614 Also, this means the input must be UTF-8, so fall back to the
1615 UTF-8 decoder at the end. */
1616 if (u != NULL) {
1617
1618 /* Optimization for empty strings */
1619 if (size == 0 && unicode_empty != NULL) {
1620 Py_INCREF(unicode_empty);
1621 return unicode_empty;
1622 }
1623
1624 /* Single characters are shared when using this constructor.
1625 Restrict to ASCII, since the input must be UTF-8. */
1626 if (size == 1 && (unsigned char)*u < 128)
1627 return get_latin1_char((unsigned char)*u);
1628
1629 return PyUnicode_DecodeUTF8(u, size, NULL);
1630 }
1631
1632 return (PyObject *)_PyUnicode_New(size);
1633 }
1634
1635 PyObject *
1636 PyUnicode_FromString(const char *u)
1637 {
1638 size_t size = strlen(u);
1639 if (size > PY_SSIZE_T_MAX) {
1640 PyErr_SetString(PyExc_OverflowError, "input too long");
1641 return NULL;
1642 }
1643
1644 return PyUnicode_FromStringAndSize(u, size);
1645 }
1646
1647 PyObject *
1648 _PyUnicode_FromId(_Py_Identifier *id)
1649 {
1650 if (!id->object) {
1651 id->object = PyUnicode_FromString(id->string);
1652 if (!id->object)
1653 return NULL;
1654 PyUnicode_InternInPlace(&id->object);
1655 assert(!id->next);
1656 id->next = static_strings;
1657 static_strings = id;
1658 }
1659 return id->object;
1660 }
1661
1662 void
1663 _PyUnicode_ClearStaticStrings()
1664 {
1665 _Py_Identifier *i;
1666 for (i = static_strings; i; i = i->next) {
1667 Py_DECREF(i->object);
1668 i->object = NULL;
1669 i->next = NULL;
1670 }
1671 }
1672
1673 /* Internal function, don't check maximum character */
1674
1675 static PyObject*
1676 unicode_fromascii(const unsigned char* s, Py_ssize_t size)
1677 {
1678 PyObject *res;
1679 #ifdef Py_DEBUG
1680 const unsigned char *p;
1681 const unsigned char *end = s + size;
1682 for (p=s; p < end; p++) {
1683 assert(*p < 128);
1684 }
1685 #endif
1686 if (size == 1)
1687 return get_latin1_char(s[0]);
1688 res = PyUnicode_New(size, 127);
1689 if (!res)
1690 return NULL;
1691 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
1692 return res;
1693 }
1694
1695 static Py_UCS4
1696 kind_maxchar_limit(unsigned int kind)
1697 {
1698 switch(kind) {
1699 case PyUnicode_1BYTE_KIND:
1700 return 0x80;
1701 case PyUnicode_2BYTE_KIND:
1702 return 0x100;
1703 case PyUnicode_4BYTE_KIND:
1704 return 0x10000;
1705 default:
1706 assert(0 && "invalid kind");
1707 return MAX_UNICODE;
1708 }
1709 }
1710
1711 static PyObject*
1712 _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1713 {
1714 PyObject *res;
1715 unsigned char max_char;
1716
1717 if (size == 0) {
1718 Py_INCREF(unicode_empty);
1719 return unicode_empty;
1720 }
1721 assert(size > 0);
1722 if (size == 1)
1723 return get_latin1_char(u[0]);
1724
1725 max_char = ucs1lib_find_max_char(u, u + size);
1726 res = PyUnicode_New(size, max_char);
1727 if (!res)
1728 return NULL;
1729 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1730 assert(_PyUnicode_CheckConsistency(res, 1));
1731 return res;
1732 }
1733
1734 static PyObject*
1735 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1736 {
1737 PyObject *res;
1738 Py_UCS2 max_char;
1739
1740 if (size == 0) {
1741 Py_INCREF(unicode_empty);
1742 return unicode_empty;
1743 }
1744 assert(size > 0);
1745 if (size == 1 && u[0] < 256)
1746 return get_latin1_char((unsigned char)u[0]);
1747
1748 max_char = ucs2lib_find_max_char(u, u + size);
1749 res = PyUnicode_New(size, max_char);
1750 if (!res)
1751 return NULL;
1752 if (max_char >= 256)
1753 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1754 else {
1755 _PyUnicode_CONVERT_BYTES(
1756 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1757 }
1758 assert(_PyUnicode_CheckConsistency(res, 1));
1759 return res;
1760 }
1761
1762 static PyObject*
1763 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1764 {
1765 PyObject *res;
1766 Py_UCS4 max_char;
1767
1768 if (size == 0) {
1769 Py_INCREF(unicode_empty);
1770 return unicode_empty;
1771 }
1772 assert(size > 0);
1773 if (size == 1 && u[0] < 256)
1774 return get_latin1_char((unsigned char)u[0]);
1775
1776 max_char = ucs4lib_find_max_char(u, u + size);
1777 res = PyUnicode_New(size, max_char);
1778 if (!res)
1779 return NULL;
1780 if (max_char < 256)
1781 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1782 PyUnicode_1BYTE_DATA(res));
1783 else if (max_char < 0x10000)
1784 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1785 PyUnicode_2BYTE_DATA(res));
1786 else
1787 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1788 assert(_PyUnicode_CheckConsistency(res, 1));
1789 return res;
1790 }
1791
1792 PyObject*
1793 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1794 {
1795 if (size < 0) {
1796 PyErr_SetString(PyExc_ValueError, "size must be positive");
1797 return NULL;
1798 }
1799 switch(kind) {
1800 case PyUnicode_1BYTE_KIND:
1801 return _PyUnicode_FromUCS1(buffer, size);
1802 case PyUnicode_2BYTE_KIND:
1803 return _PyUnicode_FromUCS2(buffer, size);
1804 case PyUnicode_4BYTE_KIND:
1805 return _PyUnicode_FromUCS4(buffer, size);
1806 default:
1807 PyErr_SetString(PyExc_SystemError, "invalid kind");
1808 return NULL;
1809 }
1810 }
1811
1812 /* Ensure that a string uses the most efficient storage, if it is not the
1813 case: create a new string with of the right kind. Write NULL into *p_unicode
1814 on error. */
1815 static void
1816 unicode_adjust_maxchar(PyObject **p_unicode)
1817 {
1818 PyObject *unicode, *copy;
1819 Py_UCS4 max_char;
1820 Py_ssize_t len;
1821 unsigned int kind;
1822
1823 assert(p_unicode != NULL);
1824 unicode = *p_unicode;
1825 assert(PyUnicode_IS_READY(unicode));
1826 if (PyUnicode_IS_ASCII(unicode))
1827 return;
1828
1829 len = PyUnicode_GET_LENGTH(unicode);
1830 kind = PyUnicode_KIND(unicode);
1831 if (kind == PyUnicode_1BYTE_KIND) {
1832 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
1833 max_char = ucs1lib_find_max_char(u, u + len);
1834 if (max_char >= 128)
1835 return;
1836 }
1837 else if (kind == PyUnicode_2BYTE_KIND) {
1838 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
1839 max_char = ucs2lib_find_max_char(u, u + len);
1840 if (max_char >= 256)
1841 return;
1842 }
1843 else {
1844 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
1845 assert(kind == PyUnicode_4BYTE_KIND);
1846 max_char = ucs4lib_find_max_char(u, u + len);
1847 if (max_char >= 0x10000)
1848 return;
1849 }
1850 copy = PyUnicode_New(len, max_char);
1851 copy_characters(copy, 0, unicode, 0, len);
1852 Py_DECREF(unicode);
1853 *p_unicode = copy;
1854 }
1855
1856 PyObject*
1857 PyUnicode_Copy(PyObject *unicode)
1858 {
1859 Py_ssize_t length;
1860 PyObject *copy;
1861
1862 if (!PyUnicode_Check(unicode)) {
1863 PyErr_BadInternalCall();
1864 return NULL;
1865 }
1866 if (PyUnicode_READY(unicode))
1867 return NULL;
1868
1869 length = PyUnicode_GET_LENGTH(unicode);
1870 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1871 if (!copy)
1872 return NULL;
1873 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1874
1875 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1876 length * PyUnicode_KIND(unicode));
1877 assert(_PyUnicode_CheckConsistency(copy, 1));
1878 return copy;
1879 }
1880
1881
1882 /* Widen Unicode objects to larger buffers. Don't write terminating null
1883 character. Return NULL on error. */
1884
1885 void*
1886 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
1887 {
1888 Py_ssize_t len;
1889 void *result;
1890 unsigned int skind;
1891
1892 if (PyUnicode_READY(s))
1893 return NULL;
1894
1895 len = PyUnicode_GET_LENGTH(s);
1896 skind = PyUnicode_KIND(s);
1897 if (skind >= kind) {
1898 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
1899 return NULL;
1900 }
1901 switch(kind) {
1902 case PyUnicode_2BYTE_KIND:
1903 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1904 if (!result)
1905 return PyErr_NoMemory();
1906 assert(skind == PyUnicode_1BYTE_KIND);
1907 _PyUnicode_CONVERT_BYTES(
1908 Py_UCS1, Py_UCS2,
1909 PyUnicode_1BYTE_DATA(s),
1910 PyUnicode_1BYTE_DATA(s) + len,
1911 result);
1912 return result;
1913 case PyUnicode_4BYTE_KIND:
1914 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1915 if (!result)
1916 return PyErr_NoMemory();
1917 if (skind == PyUnicode_2BYTE_KIND) {
1918 _PyUnicode_CONVERT_BYTES(
1919 Py_UCS2, Py_UCS4,
1920 PyUnicode_2BYTE_DATA(s),
1921 PyUnicode_2BYTE_DATA(s) + len,
1922 result);
1923 }
1924 else {
1925 assert(skind == PyUnicode_1BYTE_KIND);
1926 _PyUnicode_CONVERT_BYTES(
1927 Py_UCS1, Py_UCS4,
1928 PyUnicode_1BYTE_DATA(s),
1929 PyUnicode_1BYTE_DATA(s) + len,
1930 result);
1931 }
1932 return result;
1933 default:
1934 break;
1935 }
1936 PyErr_SetString(PyExc_SystemError, "invalid kind");
1937 return NULL;
1938 }
1939
1940 static Py_UCS4*
1941 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1942 int copy_null)
1943 {
1944 int kind;
1945 void *data;
1946 Py_ssize_t len, targetlen;
1947 if (PyUnicode_READY(string) == -1)
1948 return NULL;
1949 kind = PyUnicode_KIND(string);
1950 data = PyUnicode_DATA(string);
1951 len = PyUnicode_GET_LENGTH(string);
1952 targetlen = len;
1953 if (copy_null)
1954 targetlen++;
1955 if (!target) {
1956 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1957 PyErr_NoMemory();
1958 return NULL;
1959 }
1960 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1961 if (!target) {
1962 PyErr_NoMemory();
1963 return NULL;
1964 }
1965 }
1966 else {
1967 if (targetsize < targetlen) {
1968 PyErr_Format(PyExc_SystemError,
1969 "string is longer than the buffer");
1970 if (copy_null && 0 < targetsize)
1971 target[0] = 0;
1972 return NULL;
1973 }
1974 }
1975 if (kind == PyUnicode_1BYTE_KIND) {
1976 Py_UCS1 *start = (Py_UCS1 *) data;
1977 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
1978 }
1979 else if (kind == PyUnicode_2BYTE_KIND) {
1980 Py_UCS2 *start = (Py_UCS2 *) data;
1981 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
1982 }
1983 else {
1984 assert(kind == PyUnicode_4BYTE_KIND);
1985 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1986 }
1987 if (copy_null)
1988 target[len] = 0;
1989 return target;
1990 }
1991
1992 Py_UCS4*
1993 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1994 int copy_null)
1995 {
1996 if (target == NULL || targetsize < 0) {
1997 PyErr_BadInternalCall();
1998 return NULL;
1999 }
2000 return as_ucs4(string, target, targetsize, copy_null);
2001 }
2002
2003 Py_UCS4*
2004 PyUnicode_AsUCS4Copy(PyObject *string)
2005 {
2006 return as_ucs4(string, NULL, 0, 1);
2007 }
2008
2009 #ifdef HAVE_WCHAR_H
2010
2011 PyObject *
2012 PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2013 {
2014 if (w == NULL) {
2015 if (size == 0)
2016 return PyUnicode_New(0, 0);
2017 PyErr_BadInternalCall();
2018 return NULL;
2019 }
2020
2021 if (size == -1) {
2022 size = wcslen(w);
2023 }
2024
2025 return PyUnicode_FromUnicode(w, size);
2026 }
2027
2028 #endif /* HAVE_WCHAR_H */
2029
2030 static void
2031 makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2032 int zeropad, int width, int precision, char c)
2033 {
2034 *fmt++ = '%';
2035 if (width) {
2036 if (zeropad)
2037 *fmt++ = '0';
2038 fmt += sprintf(fmt, "%d", width);
2039 }
2040 if (precision)
2041 fmt += sprintf(fmt, ".%d", precision);
2042 if (longflag)
2043 *fmt++ = 'l';
2044 else if (longlongflag) {
2045 /* longlongflag should only ever be nonzero on machines with
2046 HAVE_LONG_LONG defined */
2047 #ifdef HAVE_LONG_LONG
2048 char *f = PY_FORMAT_LONG_LONG;
2049 while (*f)
2050 *fmt++ = *f++;
2051 #else
2052 /* we shouldn't ever get here */
2053 assert(0);
2054 *fmt++ = 'l';
2055 #endif
2056 }
2057 else if (size_tflag) {
2058 char *f = PY_FORMAT_SIZE_T;
2059 while (*f)
2060 *fmt++ = *f++;
2061 }
2062 *fmt++ = c;
2063 *fmt = '\0';
2064 }
2065
/* Helper for PyUnicode_FromFormatV().

   `f` points at the '%' of a format specification.  The optional width,
   precision and length modifier ("l", "ll" when HAVE_LONG_LONG is defined,
   or "z", each only when followed by 'd', 'u' or 'i') are parsed and
   stored through the output pointers that are not NULL.

   Return a pointer to the last character consumed. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* hand the results back through the pointers the caller supplied */
    if (p_width)
        *p_width = width;
    if (p_precision)
        *p_precision = precision;
    if (p_longflag)
        *p_longflag = longflag;
    if (p_longlongflag)
        *p_longlongflag = longlongflag;
    if (p_size_tflag)
        *p_size_tflag = size_tflag;
    return f;
}
2130
2131 /* maximum number of characters required for output of %ld. 21 characters
2132 allows for 64-bit integers (in decimal) and an optional sign. */
2133 #define MAX_LONG_CHARS 21
2134 /* maximum number of characters required for output of %lld.
2135 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2136 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2137 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2138
2139 PyObject *
2140 PyUnicode_FromFormatV(const char *format, va_list vargs)
2141 {
2142 va_list count;
2143 Py_ssize_t callcount = 0;
2144 PyObject **callresults = NULL;
2145 PyObject **callresult = NULL;
2146 Py_ssize_t n = 0;
2147 int width = 0;
2148 int precision = 0;
2149 int zeropad;
2150 const char* f;
2151 PyObject *string;
2152 /* used by sprintf */
2153 char fmt[61]; /* should be enough for %0width.precisionlld */
2154 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2155 Py_UCS4 argmaxchar;
2156 Py_ssize_t numbersize = 0;
2157 char *numberresults = NULL;
2158 char *numberresult = NULL;
2159 Py_ssize_t i;
2160 int kind;
2161 void *data;
2162
2163 Py_VA_COPY(count, vargs);
2164 /* step 1: count the number of %S/%R/%A/%s format specifications
2165 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2166 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2167 * result in an array)
2168 * also estimate a upper bound for all the number formats in the string,
2169 * numbers will be formatted in step 3 and be kept in a '\0'-separated
2170 * buffer before putting everything together. */
2171 for (f = format; *f; f++) {
2172 if (*f == '%') {
2173 int longlongflag;
2174 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2175 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2176 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2177 ++callcount;
2178
2179 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
2180 #ifdef HAVE_LONG_LONG
2181 if (longlongflag) {
2182 if (width < MAX_LONG_LONG_CHARS)
2183 width = MAX_LONG_LONG_CHARS;
2184 }
2185 else
2186 #endif
2187 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2188 including sign. Decimal takes the most space. This
2189 isn't enough for octal. If a width is specified we
2190 need more (which we allocate later). */
2191 if (width < MAX_LONG_CHARS)
2192 width = MAX_LONG_CHARS;
2193
2194 /* account for the size + '\0' to separate numbers
2195 inside of the numberresults buffer */
2196 numbersize += (width + 1);
2197 }
2198 }
2199 else if ((unsigned char)*f > 127) {
2200 PyErr_Format(PyExc_ValueError,
2201 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2202 "string, got a non-ASCII byte: 0x%02x",
2203 (unsigned char)*f);
2204 return NULL;
2205 }
2206 }
2207 /* step 2: allocate memory for the results of
2208 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2209 if (callcount) {
2210 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2211 if (!callresults) {
2212 PyErr_NoMemory();
2213 return NULL;
2214 }
2215 callresult = callresults;
2216 }
2217 /* step 2.5: allocate memory for the results of formating numbers */
2218 if (numbersize) {
2219 numberresults = PyObject_Malloc(numbersize);
2220 if (!numberresults) {
2221 PyErr_NoMemory();
2222 goto fail;
2223 }
2224 numberresult = numberresults;
2225 }
2226
2227 /* step 3: format numbers and figure out how large a buffer we need */
2228 for (f = format; *f; f++) {
2229 if (*f == '%') {
2230 const char* p;
2231 int longflag;
2232 int longlongflag;
2233 int size_tflag;
2234 int numprinted;
2235
2236 p = f;
2237 zeropad = (f[1] == '0');
2238 f = parse_format_flags(f, &width, &precision,
2239 &longflag, &longlongflag, &size_tflag);
2240 switch (*f) {
2241 case 'c':
2242 {
2243 Py_UCS4 ordinal = va_arg(count, int);
2244 maxchar = Py_MAX(maxchar, ordinal);
2245 n++;
2246 break;
2247 }
2248 case '%':
2249 n++;
2250 break;
2251 case 'i':
2252 case 'd':
2253 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2254 width, precision, *f);
2255 if (longflag)
2256 numprinted = sprintf(numberresult, fmt,
2257 va_arg(count, long));
2258 #ifdef HAVE_LONG_LONG
2259 else if (longlongflag)
2260 numprinted = sprintf(numberresult, fmt,
2261 va_arg(count, PY_LONG_LONG));
2262 #endif
2263 else if (size_tflag)
2264 numprinted = sprintf(numberresult, fmt,
2265 va_arg(count, Py_ssize_t));
2266 else
2267 numprinted = sprintf(numberresult, fmt,
2268 va_arg(count, int));
2269 n += numprinted;
2270 /* advance by +1 to skip over the '\0' */
2271 numberresult += (numprinted + 1);
2272 assert(*(numberresult - 1) == '\0');
2273 assert(*(numberresult - 2) != '\0');
2274 assert(numprinted >= 0);
2275 assert(numberresult <= numberresults + numbersize);
2276 break;
2277 case 'u':
2278 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2279 width, precision, 'u');
2280 if (longflag)
2281 numprinted = sprintf(numberresult, fmt,
2282 va_arg(count, unsigned long));
2283 #ifdef HAVE_LONG_LONG
2284 else if (longlongflag)
2285 numprinted = sprintf(numberresult, fmt,
2286 va_arg(count, unsigned PY_LONG_LONG));
2287 #endif
2288 else if (size_tflag)
2289 numprinted = sprintf(numberresult, fmt,
2290 va_arg(count, size_t));
2291 else
2292 numprinted = sprintf(numberresult, fmt,
2293 va_arg(count, unsigned int));
2294 n += numprinted;
2295 numberresult += (numprinted + 1);
2296 assert(*(numberresult - 1) == '\0');
2297 assert(*(numberresult - 2) != '\0');
2298 assert(numprinted >= 0);
2299 assert(numberresult <= numberresults + numbersize);
2300 break;
2301 case 'x':
2302 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2303 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2304 n += numprinted;
2305 numberresult += (numprinted + 1);
2306 assert(*(numberresult - 1) == '\0');
2307 assert(*(numberresult - 2) != '\0');
2308 assert(numprinted >= 0);
2309 assert(numberresult <= numberresults + numbersize);
2310 break;
2311 case 'p':
2312 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2313 /* %p is ill-defined: ensure leading 0x. */
2314 if (numberresult[1] == 'X')
2315 numberresult[1] = 'x';
2316 else if (numberresult[1] != 'x') {
2317 memmove(numberresult + 2, numberresult,
2318 strlen(numberresult) + 1);
2319 numberresult[0] = '0';
2320 numberresult[1] = 'x';
2321 numprinted += 2;
2322 }
2323 n += numprinted;
2324 numberresult += (numprinted + 1);
2325 assert(*(numberresult - 1) == '\0');
2326 assert(*(numberresult - 2) != '\0');
2327 assert(numprinted >= 0);
2328 assert(numberresult <= numberresults + numbersize);
2329 break;
2330 case 's':
2331 {
2332 /* UTF-8 */
2333 const char *s = va_arg(count, const char*);
2334 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2335 if (!str)
2336 goto fail;
2337 /* since PyUnicode_DecodeUTF8 returns already flexible
2338 unicode objects, there is no need to call ready on them */
2339 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2340 maxchar = Py_MAX(maxchar, argmaxchar);
2341 n += PyUnicode_GET_LENGTH(str);
2342 /* Remember the str and switch to the next slot */
2343 *callresult++ = str;
2344 break;
2345 }
2346 case 'U':
2347 {
2348 PyObject *obj = va_arg(count, PyObject *);
2349 assert(obj && _PyUnicode_CHECK(obj));
2350 if (PyUnicode_READY(obj) == -1)
2351 goto fail;
2352 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2353 maxchar = Py_MAX(maxchar, argmaxchar);
2354 n += PyUnicode_GET_LENGTH(obj);
2355 break;
2356 }
2357 case 'V':
2358 {
2359 PyObject *obj = va_arg(count, PyObject *);
2360 const char *str = va_arg(count, const char *);
2361 PyObject *str_obj;
2362 assert(obj || str);
2363 assert(!obj || _PyUnicode_CHECK(obj));
2364 if (obj) {
2365 if (PyUnicode_READY(obj) == -1)
2366 goto fail;
2367 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2368 maxchar = Py_MAX(maxchar, argmaxchar);
2369 n += PyUnicode_GET_LENGTH(obj);
2370 *callresult++ = NULL;
2371 }
2372 else {
2373 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2374 if (!str_obj)
2375 goto fail;
2376 if (PyUnicode_READY(str_obj)) {
2377 Py_DECREF(str_obj);
2378 goto fail;
2379 }
2380 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2381 maxchar = Py_MAX(maxchar, argmaxchar);
2382 n += PyUnicode_GET_LENGTH(str_obj);
2383 *callresult++ = str_obj;
2384 }
2385 break;
2386 }
2387 case 'S':
2388 {
2389 PyObject *obj = va_arg(count, PyObject *);
2390 PyObject *str;
2391 assert(obj);
2392 str = PyObject_Str(obj);
2393 if (!str || PyUnicode_READY(str) == -1)
2394 goto fail;
2395 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2396 maxchar = Py_MAX(maxchar, argmaxchar);
2397 n += PyUnicode_GET_LENGTH(str);
2398 /* Remember the str and switch to the next slot */
2399 *callresult++ = str;
2400 break;
2401 }
2402 case 'R':
2403 {
2404 PyObject *obj = va_arg(count, PyObject *);
2405 PyObject *repr;
2406 assert(obj);
2407 repr = PyObject_Repr(obj);
2408 if (!repr || PyUnicode_READY(repr) == -1)
2409 goto fail;
2410 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2411 maxchar = Py_MAX(maxchar, argmaxchar);
2412 n += PyUnicode_GET_LENGTH(repr);
2413 /* Remember the repr and switch to the next slot */
2414 *callresult++ = repr;
2415 break;
2416 }
2417 case 'A':
2418 {
2419 PyObject *obj = va_arg(count, PyObject *);
2420 PyObject *ascii;
2421 assert(obj);
2422 ascii = PyObject_ASCII(obj);
2423 if (!ascii || PyUnicode_READY(ascii) == -1)
2424 goto fail;
2425 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2426 maxchar = Py_MAX(maxchar, argmaxchar);
2427 n += PyUnicode_GET_LENGTH(ascii);
2428 /* Remember the repr and switch to the next slot */
2429 *callresult++ = ascii;
2430 break;
2431 }
2432 default:
2433 /* if we stumble upon an unknown
2434 formatting code, copy the rest of
2435 the format string to the output
2436 string. (we cannot just skip the
2437 code, since there's no way to know
2438 what's in the argument list) */
2439 n += strlen(p);
2440 goto expand;
2441 }
2442 } else
2443 n++;
2444 }
2445 expand:
2446 /* step 4: fill the buffer */
2447 /* Since we've analyzed how much space we need,
2448 we don't have to resize the string.
2449 There can be no errors beyond this point. */
2450 string = PyUnicode_New(n, maxchar);
2451 if (!string)
2452 goto fail;
2453 kind = PyUnicode_KIND(string);
2454 data = PyUnicode_DATA(string);
2455 callresult = callresults;
2456 numberresult = numberresults;
2457
2458 for (i = 0, f = format; *f; f++) {
2459 if (*f == '%') {
2460 const char* p;
2461
2462 p = f;
2463 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2464 /* checking for == because the last argument could be a empty
2465 string, which causes i to point to end, the assert at the end of
2466 the loop */
2467 assert(i <= PyUnicode_GET_LENGTH(string));
2468
2469 switch (*f) {
2470 case 'c':
2471 {
2472 const int ordinal = va_arg(vargs, int);
2473 PyUnicode_WRITE(kind, data, i++, ordinal);
2474 break;
2475 }
2476 case 'i':
2477 case 'd':
2478 case 'u':
2479 case 'x':
2480 case 'p':
2481 /* unused, since we already have the result */
2482 if (*f == 'p')
2483 (void) va_arg(vargs, void *);
2484 else
2485 (void) va_arg(vargs, int);
2486 /* extract the result from numberresults and append. */
2487 for (; *numberresult; ++i, ++numberresult)
2488 PyUnicode_WRITE(kind, data, i, *numberresult);
2489 /* skip over the separating '\0' */
2490 assert(*numberresult == '\0');
2491 numberresult++;
2492 assert(numberresult <= numberresults + numbersize);
2493 break;
2494 case 's':
2495 {
2496 /* unused, since we already have the result */
2497 Py_ssize_t size;
2498 (void) va_arg(vargs, char *);
2499 size = PyUnicode_GET_LENGTH(*callresult);
2500 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2501 copy_characters(string, i, *callresult, 0, size);
2502 i += size;
2503 /* We're done with the unicode()/repr() => forget it */
2504 Py_DECREF(*callresult);
2505 /* switch to next unicode()/repr() result */
2506 ++callresult;
2507 break;
2508 }
2509 case 'U':
2510 {
2511 PyObject *obj = va_arg(vargs, PyObject *);
2512 Py_ssize_t size;
2513 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2514 size = PyUnicode_GET_LENGTH(obj);
2515 copy_characters(string, i, obj, 0, size);
2516 i += size;
2517 break;
2518 }
2519 case 'V':
2520 {
2521 Py_ssize_t size;
2522 PyObject *obj = va_arg(vargs, PyObject *);
2523 va_arg(vargs, const char *);
2524 if (obj) {
2525 size = PyUnicode_GET_LENGTH(obj);
2526 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2527 copy_characters(string, i, obj, 0, size);
2528 i += size;
2529 } else {
2530 size = PyUnicode_GET_LENGTH(*callresult);
2531 assert(PyUnicode_KIND(*callresult) <=
2532 PyUnicode_KIND(string));
2533 copy_characters(string, i, *callresult, 0, size);
2534 i += size;
2535 Py_DECREF(*callresult);
2536 }
2537 ++callresult;
2538 break;
2539 }
2540 case 'S':
2541 case 'R':
2542 case 'A':
2543 {
2544 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2545 /* unused, since we already have the result */
2546 (void) va_arg(vargs, PyObject *);
2547 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2548 copy_characters(string, i, *callresult, 0, size);
2549 i += size;
2550 /* We're done with the unicode()/repr() => forget it */
2551 Py_DECREF(*callresult);
2552 /* switch to next unicode()/repr() result */
2553 ++callresult;
2554 break;
2555 }
2556 case '%':
2557 PyUnicode_WRITE(kind, data, i++, '%');
2558 break;
2559 default:
2560 for (; *p; ++p, ++i)
2561 PyUnicode_WRITE(kind, data, i, *p);
2562 assert(i == PyUnicode_GET_LENGTH(string));
2563 goto end;
2564 }
2565 }
2566 else {
2567 assert(i < PyUnicode_GET_LENGTH(string));
2568 PyUnicode_WRITE(kind, data, i++, *f);
2569 }
2570 }
2571 assert(i == PyUnicode_GET_LENGTH(string));
2572
2573 end:
2574 if (callresults)
2575 PyObject_Free(callresults);
2576 if (numberresults)
2577 PyObject_Free(numberresults);
2578 return unicode_result(string);
2579 fail:
2580 if (callresults) {
2581 PyObject **callresult2 = callresults;
2582 while (callresult2 < callresult) {
2583 Py_XDECREF(*callresult2);
2584 ++callresult2;
2585 }
2586 PyObject_Free(callresults);
2587 }
2588 if (numberresults)
2589 PyObject_Free(numberresults);
2590 return NULL;
2591 }
2592
2593 PyObject *
2594 PyUnicode_FromFormat(const char *format, ...)
2595 {
2596 PyObject* ret;
2597 va_list vargs;
2598
2599 #ifdef HAVE_STDARG_PROTOTYPES
2600 va_start(vargs, format);
2601 #else
2602 va_start(vargs);
2603 #endif
2604 ret = PyUnicode_FromFormatV(format, vargs);
2605 va_end(vargs);
2606 return ret;
2607 }
2608
2609 #ifdef HAVE_WCHAR_H
2610
2611 /* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2612 convert a Unicode object to a wide character string.
2613
2614 - If w is NULL: return the number of wide characters (including the null
2615 character) required to convert the unicode object. Ignore size argument.
2616
2617 - Otherwise: return the number of wide characters (excluding the null
2618 character) written into w. Write at most size wide characters (including
2619 the null character). */
2620 static Py_ssize_t
2621 unicode_aswidechar(PyObject *unicode,
2622 wchar_t *w,
2623 Py_ssize_t size)
2624 {
2625 Py_ssize_t res;
2626 const wchar_t *wstr;
2627
2628 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2629 if (wstr == NULL)
2630 return -1;
2631
2632 if (w != NULL) {
2633 if (size > res)
2634 size = res + 1;
2635 else
2636 res = size;
2637 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2638 return res;
2639 }
2640 else
2641 return res + 1;
2642 }
2643
2644 Py_ssize_t
2645 PyUnicode_AsWideChar(PyObject *unicode,
2646 wchar_t *w,
2647 Py_ssize_t size)
2648 {
2649 if (unicode == NULL) {
2650 PyErr_BadInternalCall();
2651 return -1;
2652 }
2653 return unicode_aswidechar(unicode, w, size);
2654 }
2655
2656 wchar_t*
2657 PyUnicode_AsWideCharString(PyObject *unicode,
2658 Py_ssize_t *size)
2659 {
2660 wchar_t* buffer;
2661 Py_ssize_t buflen;
2662
2663 if (unicode == NULL) {
2664 PyErr_BadInternalCall();
2665 return NULL;
2666 }
2667
2668 buflen = unicode_aswidechar(unicode, NULL, 0);
2669 if (buflen == -1)
2670 return NULL;
2671 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2672 PyErr_NoMemory();
2673 return NULL;
2674 }
2675
2676 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2677 if (buffer == NULL) {
2678 PyErr_NoMemory();
2679 return NULL;
2680 }
2681 buflen = unicode_aswidechar(unicode, buffer, buflen);
2682 if (buflen == -1)
2683 return NULL;
2684 if (size != NULL)
2685 *size = buflen;
2686 return buffer;
2687 }
2688
2689 #endif /* HAVE_WCHAR_H */
2690
2691 PyObject *
2692 PyUnicode_FromOrdinal(int ordinal)
2693 {
2694 PyObject *v;
2695 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2696 PyErr_SetString(PyExc_ValueError,
2697 "chr() arg not in range(0x110000)");
2698 return NULL;
2699 }
2700
2701 if (ordinal < 256)
2702 return get_latin1_char(ordinal);
2703
2704 v = PyUnicode_New(1, ordinal);
2705 if (v == NULL)
2706 return NULL;
2707 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2708 assert(_PyUnicode_CheckConsistency(v, 1));
2709 return v;
2710 }
2711
2712 PyObject *
2713 PyUnicode_FromObject(register PyObject *obj)
2714 {
2715 /* XXX Perhaps we should make this API an alias of
2716 PyObject_Str() instead ?! */
2717 if (PyUnicode_CheckExact(obj)) {
2718 if (PyUnicode_READY(obj))
2719 return NULL;
2720 Py_INCREF(obj);
2721 return obj;
2722 }
2723 if (PyUnicode_Check(obj)) {
2724 /* For a Unicode subtype that's not a Unicode object,
2725 return a true Unicode object with the same data. */
2726 return PyUnicode_Copy(obj);
2727 }
2728 PyErr_Format(PyExc_TypeError,
2729 "Can't convert '%.100s' object to str implicitly",
2730 Py_TYPE(obj)->tp_name);
2731 return NULL;
2732 }
2733
2734 PyObject *
2735 PyUnicode_FromEncodedObject(register PyObject *obj,
2736 const char *encoding,
2737 const char *errors)
2738 {
2739 Py_buffer buffer;
2740 PyObject *v;
2741
2742 if (obj == NULL) {
2743 PyErr_BadInternalCall();
2744 return NULL;
2745 }
2746
2747 /* Decoding bytes objects is the most common case and should be fast */
2748 if (PyBytes_Check(obj)) {
2749 if (PyBytes_GET_SIZE(obj) == 0) {
2750 Py_INCREF(unicode_empty);
2751 v = unicode_empty;
2752 }
2753 else {
2754 v = PyUnicode_Decode(
2755 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2756 encoding, errors);
2757 }
2758 return v;
2759 }
2760
2761 if (PyUnicode_Check(obj)) {
2762 PyErr_SetString(PyExc_TypeError,
2763 "decoding str is not supported");
2764 return NULL;
2765 }
2766
2767 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2768 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2769 PyErr_Format(PyExc_TypeError,
2770 "coercing to str: need bytes, bytearray "
2771 "or buffer-like object, %.80s found",
2772 Py_TYPE(obj)->tp_name);
2773 return NULL;
2774 }
2775
2776 if (buffer.len == 0) {
2777 Py_INCREF(unicode_empty);
2778 v = unicode_empty;
2779 }
2780 else
2781 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2782
2783 PyBuffer_Release(&buffer);
2784 return v;
2785 }
2786
/* Write a normalized copy of `encoding` into `lower` (a buffer of
   `lower_len` bytes): ASCII upper case letters are lowered and '_' becomes
   '-', so that e.g. "UTF_8" is recognized as "utf-8".  A NULL `encoding`
   is treated as "utf-8".

   Returns 1 on success (`lower` is then null-terminated) and 0 if the
   result, including its terminating null, does not fit into `lower_len`
   bytes.  On failure the contents of `lower` are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* No room even for the terminating null; also keeps lower_len - 1
       below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the terminating null as well */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    /* last slot of the buffer, reserved for the terminating null */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2823
2824 PyObject *
2825 PyUnicode_Decode(const char *s,
2826 Py_ssize_t size,
2827 const char *encoding,
2828 const char *errors)
2829 {
2830 PyObject *buffer = NULL, *unicode;
2831 Py_buffer info;
2832 char lower[11]; /* Enough for any encoding shortcut */
2833
2834 /* Shortcuts for common default encodings */
2835 if (normalize_encoding(encoding, lower, sizeof(lower))) {
2836 if ((strcmp(lower, "utf-8") == 0) ||
2837 (strcmp(lower, "utf8") == 0))
2838 return PyUnicode_DecodeUTF8(s, size, errors);
2839 else if ((strcmp(lower, "latin-1") == 0) ||
2840 (strcmp(lower, "latin1") == 0) ||
2841 (strcmp(lower, "iso-8859-1") == 0))
2842 return PyUnicode_DecodeLatin1(s, size, errors);
2843 #ifdef HAVE_MBCS
2844 else if (strcmp(lower, "mbcs") == 0)
2845 return PyUnicode_DecodeMBCS(s, size, errors);
2846 #endif
2847 else if (strcmp(lower, "ascii") == 0)
2848 return PyUnicode_DecodeASCII(s, size, errors);
2849 else if (strcmp(lower, "utf-16") == 0)
2850 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2851 else if (strcmp(lower, "utf-32") == 0)
2852 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2853 }
2854
2855 /* Decode via the codec registry */
2856 buffer = NULL;
2857 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2858 goto onError;
2859 buffer = PyMemoryView_FromBuffer(&info);
2860 if (buffer == NULL)
2861 goto onError;
2862 unicode = PyCodec_Decode(buffer, encoding, errors);
2863 if (unicode == NULL)
2864 goto onError;
2865 if (!PyUnicode_Check(unicode)) {
2866 PyErr_Format(PyExc_TypeError,
2867 "decoder did not return a str object (type=%.400s)",
2868 Py_TYPE(unicode)->tp_name);
2869 Py_DECREF(unicode);
2870 goto onError;
2871 }
2872 Py_DECREF(buffer);
2873 return unicode_result(unicode);
2874
2875 onError:
2876 Py_XDECREF(buffer);
2877 return NULL;
2878 }
2879
2880 PyObject *
2881 PyUnicode_AsDecodedObject(PyObject *unicode,
2882 const char *encoding,
2883 const char *errors)
2884 {
2885 PyObject *v;
2886
2887 if (!PyUnicode_Check(unicode)) {
2888 PyErr_BadArgument();
2889 goto onError;
2890 }
2891
2892 if (encoding == NULL)
2893 encoding = PyUnicode_GetDefaultEncoding();
2894
2895 /* Decode via the codec registry */
2896 v = PyCodec_Decode(unicode, encoding, errors);
2897 if (v == NULL)
2898 goto onError;
2899 return unicode_result(v);
2900
2901 onError:
2902 return NULL;
2903 }
2904
2905 PyObject *
2906 PyUnicode_AsDecodedUnicode(PyObject *unicode,
2907 const char *encoding,
2908 const char *errors)
2909 {
2910 PyObject *v;
2911
2912 if (!PyUnicode_Check(unicode)) {
2913 PyErr_BadArgument();
2914 goto onError;
2915 }
2916
2917 if (encoding == NULL)
2918 encoding = PyUnicode_GetDefaultEncoding();
2919
2920 /* Decode via the codec registry */
2921 v = PyCodec_Decode(unicode, encoding, errors);
2922 if (v == NULL)
2923 goto onError;
2924 if (!PyUnicode_Check(v)) {
2925 PyErr_Format(PyExc_TypeError,
2926 "decoder did not return a str object (type=%.400s)",
2927 Py_TYPE(v)->tp_name);
2928 Py_DECREF(v);
2929 goto onError;
2930 }
2931 return unicode_result(v);
2932
2933 onError:
2934 return NULL;
2935 }
2936
2937 PyObject *
2938 PyUnicode_Encode(const Py_UNICODE *s,
2939 Py_ssize_t size,
2940 const char *encoding,
2941 const char *errors)
2942 {
2943 PyObject *v, *unicode;
2944
2945 unicode = PyUnicode_FromUnicode(s, size);
2946 if (unicode == NULL)
2947 return NULL;
2948 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2949 Py_DECREF(unicode);
2950 return v;
2951 }
2952
2953 PyObject *
2954 PyUnicode_AsEncodedObject(PyObject *unicode,
2955 const char *encoding,
2956 const char *errors)
2957 {
2958 PyObject *v;
2959
2960 if (!PyUnicode_Check(unicode)) {
2961 PyErr_BadArgument();
2962 goto onError;
2963 }
2964
2965 if (encoding == NULL)
2966 encoding = PyUnicode_GetDefaultEncoding();
2967
2968 /* Encode via the codec registry */
2969 v = PyCodec_Encode(unicode, encoding, errors);
2970 if (v == NULL)
2971 goto onError;
2972 return v;
2973
2974 onError:
2975 return NULL;
2976 }
2977
2978 PyObject *
2979 PyUnicode_EncodeFSDefault(PyObject *unicode)
2980 {
2981 #ifdef HAVE_MBCS
2982 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
2983 #elif defined(__APPLE__)
2984 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
2985 #else
2986 PyInterpreterState *interp = PyThreadState_GET()->interp;
2987 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2988 cannot use it to encode and decode filenames before it is loaded. Load
2989 the Python codec requires to encode at least its own filename. Use the C
2990 version of the locale codec until the codec registry is initialized and
2991 the Python codec is loaded.
2992
2993 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2994 cannot only rely on it: check also interp->fscodec_initialized for
2995 subinterpreters. */
2996 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
2997 return PyUnicode_AsEncodedString(unicode,
2998 Py_FileSystemDefaultEncoding,
2999 "surrogateescape");
3000 }
3001 else {
3002 /* locale encoding with surrogateescape */
3003 wchar_t *wchar;
3004 char *bytes;
3005 PyObject *bytes_obj;
3006 size_t error_pos;
3007
3008 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3009 if (wchar == NULL)
3010 return NULL;
3011 bytes = _Py_wchar2char(wchar, &error_pos);
3012 if (bytes == NULL) {
3013 if (error_pos != (size_t)-1) {
3014 char *errmsg = strerror(errno);
3015 PyObject *exc = NULL;
3016 if (errmsg == NULL)
3017 errmsg = "Py_wchar2char() failed";
3018 raise_encode_exception(&exc,
3019 "filesystemencoding", unicode,
3020 error_pos, error_pos+1,
3021 errmsg);
3022 Py_XDECREF(exc);
3023 }
3024 else
3025 PyErr_NoMemory();
3026 PyMem_Free(wchar);
3027 return NULL;
3028 }
3029 PyMem_Free(wchar);
3030
3031 bytes_obj = PyBytes_FromString(bytes);
3032 PyMem_Free(bytes);
3033 return bytes_obj;
3034 }
3035 #endif
3036 }
3037
3038 PyObject *
3039 PyUnicode_AsEncodedString(PyObject *unicode,
3040 const char *encoding,
3041 const char *errors)
3042 {
3043 PyObject *v;
3044 char lower[11]; /* Enough for any encoding shortcut */
3045
3046 if (!PyUnicode_Check(unicode)) {
3047 PyErr_BadArgument();
3048 return NULL;
3049 }
3050
3051 /* Shortcuts for common default encodings */
3052 if (normalize_encoding(encoding, lower, sizeof(lower))) {
3053 if ((strcmp(lower, "utf-8") == 0) ||
3054 (strcmp(lower, "utf8") == 0))
3055 {
3056 if (errors == NULL || strcmp(errors, "strict") == 0)
3057 return _PyUnicode_AsUTF8String(unicode, NULL);
3058 else
3059 return _PyUnicode_AsUTF8String(unicode, errors);
3060 }
3061 else if ((strcmp(lower, "latin-1") == 0) ||
3062 (strcmp(lower, "latin1") == 0) ||
3063 (strcmp(lower, "iso-8859-1") == 0))
3064 return _PyUnicode_AsLatin1String(unicode, errors);
3065 #ifdef HAVE_MBCS
3066 else if (strcmp(lower, "mbcs") == 0)
3067 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3068 #endif
3069 else if (strcmp(lower, "ascii") == 0)
3070 return _PyUnicode_AsASCIIString(unicode, errors);
3071 }
3072
3073 /* Encode via the codec registry */
3074 v = PyCodec_Encode(unicode, encoding, errors);
3075 if (v == NULL)
3076 return NULL;
3077
3078 /* The normal path */
3079 if (PyBytes_Check(v))
3080 return v;
3081
3082 /* If the codec returns a buffer, raise a warning and convert to bytes */
3083 if (PyByteArray_Check(v)) {
3084 int error;
3085 PyObject *b;
3086
3087 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3088 "encoder %s returned bytearray instead of bytes",
3089 encoding);
3090 if (error) {
3091 Py_DECREF(v);
3092 return NULL;
3093 }
3094
3095 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3096 Py_DECREF(v);
3097 return b;
3098 }
3099
3100 PyErr_Format(PyExc_TypeError,
3101 "encoder did not return a bytes object (type=%.400s)",
3102 Py_TYPE(v)->tp_name);
3103 Py_DECREF(v);
3104 return NULL;
3105 }
3106
3107 PyObject *
3108 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3109 const char *encoding,
3110 const char *errors)
3111 {
3112 PyObject *v;
3113
3114 if (!PyUnicode_Check(unicode)) {
3115 PyErr_BadArgument();
3116 goto onError;
3117 }
3118
3119 if (encoding == NULL)
3120 encoding = PyUnicode_GetDefaultEncoding();
3121
3122 /* Encode via the codec registry */
3123 v = PyCodec_Encode(unicode, encoding, errors);
3124 if (v == NULL)
3125 goto onError;
3126 if (!PyUnicode_Check(v)) {
3127 PyErr_Format(PyExc_TypeError,
3128 "encoder did not return an str object (type=%.400s)",
3129 Py_TYPE(v)->tp_name);
3130 Py_DECREF(v);
3131 goto onError;
3132 }
3133 return v;
3134
3135 onError:
3136 return NULL;
3137 }
3138
3139 PyObject*
3140 PyUnicode_DecodeFSDefault(const char *s) {
3141 Py_ssize_t size = (Py_ssize_t)strlen(s);
3142 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3143 }
3144
3145 PyObject*
3146 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3147 {
3148 #ifdef HAVE_MBCS
3149 return PyUnicode_DecodeMBCS(s, size, NULL);
3150 #elif defined(__APPLE__)
3151 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3152 #else
3153 PyInterpreterState *interp = PyThreadState_GET()->interp;
3154 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3155 cannot use it to encode and decode filenames before it is loaded. Load
3156 the Python codec requires to encode at least its own filename. Use the C
3157 version of the locale codec until the codec registry is initialized and
3158 the Python codec is loaded.
3159
3160 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3161 cannot only rely on it: check also interp->fscodec_initialized for
3162 subinterpreters. */
3163 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3164 return PyUnicode_Decode(s, size,
3165 Py_FileSystemDefaultEncoding,
3166 "surrogateescape");
3167 }
3168 else {
3169 /* locale encoding with surrogateescape */
3170 wchar_t *wchar;
3171 PyObject *unicode;
3172 size_t len;
3173
3174 if (s[size] != '\0' || size != strlen(s)) {
3175 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3176 return NULL;
3177 }
3178
3179 wchar = _Py_char2wchar(s, &len);
3180 if (wchar == NULL)
3181 return PyErr_NoMemory();
3182
3183 unicode = PyUnicode_FromWideChar(wchar, len);
3184 PyMem_Free(wchar);
3185 return unicode;
3186 }
3187 #endif
3188 }
3189
3190
3191 int
3192 PyUnicode_FSConverter(PyObject* arg, void* addr)
3193 {
3194 PyObject *output = NULL;
3195 Py_ssize_t size;
3196 void *data;
3197 if (arg == NULL) {
3198 Py_DECREF(*(PyObject**)addr);
3199 return 1;
3200 }
3201 if (PyBytes_Check(arg)) {
3202 output = arg;
3203 Py_INCREF(output);
3204 }
3205 else {
3206 arg = PyUnicode_FromObject(arg);
3207 if (!arg)
3208 return 0;
3209 output = PyUnicode_EncodeFSDefault(arg);
3210 Py_DECREF(arg);
3211 if (!output)
3212 return 0;
3213 if (!PyBytes_Check(output)) {
3214 Py_DECREF(output);
3215 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3216 return 0;
3217 }
3218 }
3219 size = PyBytes_GET_SIZE(output);
3220 data = PyBytes_AS_STRING(output);
3221 if (size != strlen(data)) {
3222 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3223 Py_DECREF(output);
3224 return 0;
3225 }
3226 *(PyObject**)addr = output;
3227 return Py_CLEANUP_SUPPORTED;
3228 }
3229
3230
3231 int
3232 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3233 {
3234 PyObject *output = NULL;
3235 if (arg == NULL) {
3236 Py_DECREF(*(PyObject**)addr);
3237 return 1;
3238 }
3239 if (PyUnicode_Check(arg)) {
3240 if (PyUnicode_READY(arg))
3241 return 0;
3242 output = arg;
3243 Py_INCREF(output);
3244 }
3245 else {
3246 arg = PyBytes_FromObject(arg);
3247 if (!arg)
3248 return 0;
3249 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3250 PyBytes_GET_SIZE(arg));
3251 Py_DECREF(arg);
3252 if (!output)
3253 return 0;
3254 if (!PyUnicode_Check(output)) {
3255 Py_DECREF(output);
3256 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3257 return 0;
3258 }
3259 }
3260 if (PyUnicode_READY(output) < 0) {
3261 Py_DECREF(output);
3262 return 0;
3263 }
3264 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3265 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3266 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3267 Py_DECREF(output);
3268 return 0;
3269 }
3270 *(PyObject**)addr = output;
3271 return Py_CLEANUP_SUPPORTED;
3272 }
3273
3274
3275 char*
3276 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3277 {
3278 PyObject *bytes;
3279
3280 if (!PyUnicode_Check(unicode)) {
3281 PyErr_BadArgument();
3282 return NULL;
3283 }
3284 if (PyUnicode_READY(unicode) == -1)
3285 return NULL;
3286
3287 if (PyUnicode_UTF8(unicode) == NULL) {
3288 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3289 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3290 if (bytes == NULL)
3291 return NULL;
3292 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3293 if (_PyUnicode_UTF8(unicode) == NULL) {
3294 Py_DECREF(bytes);
3295 return NULL;
3296 }
3297 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3298 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3299 PyBytes_AS_STRING(bytes),
3300 _PyUnicode_UTF8_LENGTH(unicode) + 1);
3301 Py_DECREF(bytes);
3302 }
3303
3304 if (psize)
3305 *psize = PyUnicode_UTF8_LENGTH(unicode);
3306 return PyUnicode_UTF8(unicode);
3307 }
3308
3309 char*
3310 PyUnicode_AsUTF8(PyObject *unicode)
3311 {
3312 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3313 }
3314
#if defined(Py_DEBUG)
/* Debug-build counter: incremented by PyUnicode_AsUnicodeAndSize() each time
   it has to build the wchar_t representation of an object. */
static int unicode_as_unicode_calls = 0;
#endif
3318
3319
3320 Py_UNICODE *
3321 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3322 {
3323 const unsigned char *one_byte;
3324 #if SIZEOF_WCHAR_T == 4
3325 const Py_UCS2 *two_bytes;
3326 #else
3327 const Py_UCS4 *four_bytes;
3328 const Py_UCS4 *ucs4_end;
3329 Py_ssize_t num_surrogates;
3330 #endif
3331 wchar_t *w;
3332 wchar_t *wchar_end;
3333
3334 if (!PyUnicode_Check(unicode)) {
3335 PyErr_BadArgument();
3336 return NULL;
3337 }
3338 if (_PyUnicode_WSTR(unicode) == NULL) {
3339 /* Non-ASCII compact unicode object */
3340 assert(_PyUnicode_KIND(unicode) != 0);
3341 assert(PyUnicode_IS_READY(unicode));
3342
3343 #ifdef Py_DEBUG
3344 ++unicode_as_unicode_calls;
3345 #endif
3346
3347 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3348 #if SIZEOF_WCHAR_T == 2
3349 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3350 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3351 num_surrogates = 0;
3352
3353 for (; four_bytes < ucs4_end; ++four_bytes) {
3354 if (*four_bytes > 0xFFFF)
3355 ++num_surrogates;
3356 }
3357
3358 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3359 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3360 if (!_PyUnicode_WSTR(unicode)) {
3361 PyErr_NoMemory();
3362 return NULL;
3363 }
3364 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3365
3366 w = _PyUnicode_WSTR(unicode);
3367 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3368 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3369 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3370 if (*four_bytes > 0xFFFF) {
3371 assert(*four_bytes <= MAX_UNICODE);
3372 /* encode surrogate pair in this case */
3373 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3374 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3375 }
3376 else
3377 *w = *four_bytes;
3378
3379 if (w > wchar_end) {
3380 assert(0 && "Miscalculated string end");
3381 }
3382 }
3383 *w = 0;
3384 #else
3385 /* sizeof(wchar_t) == 4 */
3386 Py_FatalError("Impossible unicode object state, wstr and str "
3387 "should share memory already.");
3388 return NULL;
3389 #endif
3390 }
3391 else {
3392 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3393 (_PyUnicode_LENGTH(unicode) + 1));
3394 if (!_PyUnicode_WSTR(unicode)) {
3395 PyErr_NoMemory();
3396 return NULL;
3397 }
3398 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3399 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3400 w = _PyUnicode_WSTR(unicode);
3401 wchar_end = w + _PyUnicode_LENGTH(unicode);
3402
3403 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3404 one_byte = PyUnicode_1BYTE_DATA(unicode);
3405 for (; w < wchar_end; ++one_byte, ++w)
3406 *w = *one_byte;
3407 /* null-terminate the wstr */
3408 *w = 0;
3409 }
3410 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3411 #if SIZEOF_WCHAR_T == 4
3412 two_bytes = PyUnicode_2BYTE_DATA(unicode);
3413 for (; w < wchar_end; ++two_bytes, ++w)
3414 *w = *two_bytes;
3415 /* null-terminate the wstr */
3416 *w = 0;
3417 #else
3418 /* sizeof(wchar_t) == 2 */
3419 PyObject_FREE(_PyUnicode_WSTR(unicode));
3420 _PyUnicode_WSTR(unicode) = NULL;
3421 Py_FatalError("Impossible unicode object state, wstr "
3422 "and str should share memory already.");
3423 return NULL;
3424 #endif
3425 }
3426 else {
3427 assert(0 && "This should never happen.");
3428 }
3429 }
3430 }
3431 if (size != NULL)
3432 *size = PyUnicode_WSTR_LENGTH(unicode);
3433 return _PyUnicode_WSTR(unicode);
3434 }
3435
3436 Py_UNICODE *
3437 PyUnicode_AsUnicode(PyObject *unicode)
3438 {
3439 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3440 }
3441
3442
3443 Py_ssize_t
3444 PyUnicode_GetSize(PyObject *unicode)
3445 {
3446 if (!PyUnicode_Check(unicode)) {
3447 PyErr_BadArgument();
3448 goto onError;
3449 }
3450 return PyUnicode_GET_SIZE(unicode);
3451
3452 onError:
3453 return -1;
3454 }
3455
3456 Py_ssize_t
3457 PyUnicode_GetLength(PyObject *unicode)
3458 {
3459 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3460 PyErr_BadArgument();
3461 return -1;
3462 }
3463
3464 return PyUnicode_GET_LENGTH(unicode);
3465 }
3466
3467 Py_UCS4
3468 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3469 {
3470 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3471 PyErr_BadArgument();
3472 return (Py_UCS4)-1;
3473 }
3474 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3475 PyErr_SetString(PyExc_IndexError, "string index out of range");
3476 return (Py_UCS4)-1;
3477 }
3478 return PyUnicode_READ_CHAR(unicode, index);
3479 }
3480
3481 int
3482 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3483 {
3484 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3485 PyErr_BadArgument();
3486 return -1;
3487 }
3488 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3489 PyErr_SetString(PyExc_IndexError, "string index out of range");
3490 return -1;
3491 }
3492 if (_PyUnicode_Dirty(unicode))
3493 return -1;
3494 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3495 index, ch);
3496 return 0;
3497 }
3498
/* Name of the default encoding.  Always the constant string "utf-8"; the
   returned pointer refers to static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3504
3505 /* create or adjust a UnicodeDecodeError */
3506 static void
3507 make_decode_exception(PyObject **exceptionObject,
3508 const char *encoding,
3509 const char *input, Py_ssize_t length,
3510 Py_ssize_t startpos, Py_ssize_t endpos,
3511 const char *reason)
3512 {
3513 if (*exceptionObject == NULL) {
3514 *exceptionObject = PyUnicodeDecodeError_Create(
3515 encoding, input, length, startpos, endpos, reason);
3516 }
3517 else {
3518 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3519 goto onError;
3520 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3521 goto onError;
3522 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3523 goto onError;
3524 }
3525 return;
3526
3527 onError:
3528 Py_DECREF(*exceptionObject);
3529 *exceptionObject = NULL;
3530 }
3531
3532 /* error handling callback helper:
3533 build arguments, call the callback and check the arguments,
3534 if no exception occurred, copy the replacement to the output
3535 and adjust various state variables.
3536 return 0 on success, -1 on error
3537 */
3538
3539 static int
3540 unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3541 const char *encoding, const char *reason,
3542 const char **input, const char **inend, Py_ssize_t *startinpos,
3543 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3544 PyObject **output, Py_ssize_t *outpos)
3545 {
3546 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3547
3548 PyObject *restuple = NULL;
3549 PyObject *repunicode = NULL;
3550 Py_ssize_t outsize;
3551 Py_ssize_t insize;
3552 Py_ssize_t requiredsize;
3553 Py_ssize_t newpos;
3554 PyObject *inputobj = NULL;
3555 int res = -1;
3556
3557 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3558 outsize = PyUnicode_GET_LENGTH(*output);
3559 else
3560 outsize = _PyUnicode_WSTR_LENGTH(*output);
3561
3562 if (*errorHandler == NULL) {
3563 *errorHandler = PyCodec_LookupError(errors);
3564 if (*errorHandler == NULL)
3565 goto onError;
3566 }
3567
3568 make_decode_exception(exceptionObject,
3569 encoding,
3570 *input, *inend - *input,
3571 *startinpos, *endinpos,
3572 reason);
3573 if (*exceptionObject == NULL)
3574 goto onError;
3575
3576 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3577 if (restuple == NULL)
3578 goto onError;
3579 if (!PyTuple_Check(restuple)) {
3580 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3581 goto onError;
3582 }
3583 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3584 goto onError;
3585 if (PyUnicode_READY(repunicode) < 0)
3586 goto onError;
3587
3588 /* Copy back the bytes variables, which might have been modified by the
3589 callback */
3590 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3591 if (!inputobj)
3592 goto onError;
3593 if (!PyBytes_Check(inputobj)) {
3594 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3595 }
3596 *input = PyBytes_AS_STRING(inputobj);
3597 insize = PyBytes_GET_SIZE(inputobj);
3598 *inend = *input + insize;
3599 /* we can DECREF safely, as the exception has another reference,
3600 so the object won't go away. */
3601 Py_DECREF(inputobj);
3602
3603 if (newpos<0)
3604 newpos = insize+newpos;
3605 if (newpos<0 || newpos>insize) {
3606 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3607 goto onError;
3608 }
3609
3610 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3611 /* need more space? (at least enough for what we
3612 have+the replacement+the rest of the string (starting
3613 at the new input position), so we won't have to check space
3614 when there are no errors in the rest of the string) */
3615 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3616 requiredsize = *outpos + replen + insize-newpos;
3617 if (requiredsize > outsize) {
3618 if (requiredsize<2*outsize)
3619 requiredsize = 2*outsize;
3620 if (unicode_resize(output, requiredsize) < 0)
3621 goto onError;
3622 }
3623 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
3624 goto onError;
3625 copy_characters(*output, *outpos, repunicode, 0, replen);
3626 *outpos += replen;
3627 }
3628 else {
3629 wchar_t *repwstr;
3630 Py_ssize_t repwlen;
3631 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3632 if (repwstr == NULL)
3633 goto onError;
3634 /* need more space? (at least enough for what we
3635 have+the replacement+the rest of the string (starting
3636 at the new input position), so we won't have to check space
3637 when there are no errors in the rest of the string) */
3638 requiredsize = *outpos + repwlen + insize-newpos;
3639 if (requiredsize > outsize) {
3640 if (requiredsize < 2*outsize)
3641 requiredsize = 2*outsize;
3642 if (unicode_resize(output, requiredsize) < 0)
3643 goto onError;
3644 }
3645 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3646 *outpos += repwlen;
3647 }
3648 *endinpos = newpos;
3649 *inptr = *input + newpos;
3650
3651 /* we made it! */
3652 res = 0;
3653
3654 onError:
3655 Py_XDECREF(restuple);
3656 return res;
3657 }
3658
3659 /* --- UTF-7 Codec -------------------------------------------------------- */
3660
3661 /* See RFC2152 for details. We encode conservatively and decode liberally. */
3662
/* Base-64 helpers used by the UTF-7 codec.  All three are plain macros and
   evaluate their argument more than once, so pass side-effect-free
   expressions only. */

/* True when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||          \
     ('0' <= (c) && (c) <= '9') ||        \
     ('A' <= (c) && (c) <= 'Z') ||        \
     ('a' <= (c) && (c) <= 'z'))

/* Base-64 value (0..63) of c.  c is expected to be a base-64 character;
   anything that is not a letter, digit or '+' maps to 63, the value
   of '/'. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 :                                    \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :        \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :        \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n) \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"        \
      "abcdefghijklmnopqrstuvwxyz"        \
      "0123456789+/")[(n) & 0x3f])
3685
/* DECODE_DIRECT: true when a byte found outside a base-64 section of a
 * UTF-7 string stands for itself.  Decoding is permissive: every value up
 * to 127 qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3693
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  Each entry holds one of four category codes:
 *   0 : "Set D"      - alphanumerics and ' ( ) , - . / : ?
 *   1 : "Set O"      - ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
 *   2 : "whitespace" - ht nl cr sp
 *   3 : special (must be base64 encoded) - everything else, i.e. + \ ~
 *       and the non-printing codes 0-8, 11-12, 14-31 and 127
 *
 * The table is written with short symbolic names so each row can be read
 * against the characters listed above it; the names are removed again
 * right after the table.
 */

#define U7_D 0   /* Set D */
#define U7_O 1   /* Set O */
#define U7_W 2   /* whitespace */
#define U7_X 3   /* special */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel  bs  ht  nl  vt  np  cr  so  si */
    U7_X, U7_X, U7_X, U7_X, U7_X, U7_X, U7_X, U7_X,
    U7_X, U7_W, U7_W, U7_X, U7_X, U7_W, U7_X, U7_X,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can  em sub esc  fs  gs  rs  us */
    U7_X, U7_X, U7_X, U7_X, U7_X, U7_X, U7_X, U7_X,
    U7_X, U7_X, U7_X, U7_X, U7_X, U7_X, U7_X, U7_X,
    /* 0x20:  sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
    U7_W, U7_O, U7_O, U7_O, U7_O, U7_O, U7_O, U7_D,
    U7_D, U7_D, U7_O, U7_X, U7_D, U7_D, U7_D, U7_D,
    /* 0x30:   0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
    U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D,
    U7_D, U7_D, U7_D, U7_O, U7_O, U7_O, U7_O, U7_D,
    /* 0x40:   @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
    U7_O, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D,
    U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D,
    /* 0x50:   P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
    U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D,
    U7_D, U7_D, U7_D, U7_O, U7_X, U7_O, U7_O, U7_O,
    /* 0x60:   `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
    U7_O, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D,
    U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D,
    /* 0x70:   p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~ del */
    U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D, U7_D,
    U7_D, U7_D, U7_D, U7_O, U7_O, U7_O, U7_X, U7_X
};

#undef U7_D
#undef U7_O
#undef U7_W
#undef U7_X
3727
/* ENCODE_DIRECT: true when character c should be written to the UTF-7
 * output as itself.
 *
 *   c        - the character; must be in 1..127 to qualify at all
 *   directO  - non-zero when "Set O" characters (category 1) are written
 *              directly
 *   directWS - non-zero when whitespace (category 2) is written directly
 *
 * "Set D" characters (category 0) always qualify.  RFC2152 makes it clear
 * that the treatment of Set O and whitespace varies between applications,
 * which is why both are parameters.
 *
 * Every parameter is parenthesized in the expansion so that compound
 * arguments such as `a || b` keep their meaning.  c is evaluated several
 * times; pass a side-effect-free expression. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
3739
3740 PyObject *
3741 PyUnicode_DecodeUTF7(const char *s,
3742 Py_ssize_t size,
3743 const char *errors)
3744 {
3745 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3746 }
3747
3748 /* The decoder. The only state we preserve is our read position,
3749 * i.e. how many characters we have consumed. So if we end in the
3750 * middle of a shift sequence we have to back off the read position
3751 * and the output to the beginning of the sequence, otherwise we lose
3752 * all the shift state (seen bits, number of bits seen, high
3753 * surrogate). */
3754
3755 PyObject *
3756 PyUnicode_DecodeUTF7Stateful(const char *s,
3757 Py_ssize_t size,
3758 const char *errors,
3759 Py_ssize_t *consumed)
3760 {
3761 const char *starts = s;
3762 Py_ssize_t startinpos;
3763 Py_ssize_t endinpos;
3764 Py_ssize_t outpos;
3765 const char *e;
3766 PyObject *unicode;
3767 const char *errmsg = "";
3768 int inShift = 0;
3769 Py_ssize_t shiftOutStart;
3770 unsigned int base64bits = 0;
3771 unsigned long base64buffer = 0;
3772 Py_UCS4 surrogate = 0;
3773 PyObject *errorHandler = NULL;
3774 PyObject *exc = NULL;
3775
3776 /* Start off assuming it's all ASCII. Widen later as necessary. */
3777 unicode = PyUnicode_New(size, 127);
3778 if (!unicode)
3779 return NULL;
3780 if (size == 0) {
3781 if (consumed)
3782 *consumed = 0;
3783 return unicode;
3784 }
3785
3786 shiftOutStart = outpos = 0;
3787 e = s + size;
3788
3789 while (s < e) {
3790 Py_UCS4 ch;
3791 restart:
3792 ch = (unsigned char) *s;
3793
3794 if (inShift) { /* in a base-64 section */
3795 if (IS_BASE64(ch)) { /* consume a base-64 character */
3796 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3797 base64bits += 6;
3798 s++;
3799 if (base64bits >= 16) {
3800 /* we have enough bits for a UTF-16 value */
3801 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
3802 base64bits -= 16;
3803 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3804 if (surrogate) {
3805 /* expecting a second surrogate */
3806 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3807 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
3808 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3809 goto onError;
3810 surrogate = 0;
3811 continue;
3812 }
3813 else {
3814 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3815 goto onError;
3816 surrogate = 0;
3817 }
3818 }
3819 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
3820 /* first surrogate */
3821 surrogate = outCh;
3822 }
3823 else {
3824 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3825 goto onError;
3826 }
3827 }
3828 }
3829 else { /* now leaving a base-64 section */
3830 inShift = 0;
3831 s++;
3832 if (surrogate) {
3833 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3834 goto onError;
3835 surrogate = 0;
3836 }
3837 if (base64bits > 0) { /* left-over bits */
3838 if (base64bits >= 6) {
3839 /* We've seen at least one base-64 character */
3840 errmsg = "partial character in shift sequence";
3841 goto utf7Error;
3842 }
3843 else {
3844 /* Some bits remain; they should be zero */
3845 if (base64buffer != 0) {
3846 errmsg = "non-zero padding bits in shift sequence";
3847 goto utf7Error;
3848 }
3849 }
3850 }
3851 if (ch != '-') {
3852 /* '-' is absorbed; other terminating
3853 characters are preserved */
3854 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3855 goto onError;
3856 }
3857 }
3858 }
3859 else if ( ch == '+' ) {
3860 startinpos = s-starts;
3861 s++; /* consume '+' */
3862 if (s < e && *s == '-') { /* '+-' encodes '+' */
3863 s++;
3864 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3865 goto onError;
3866 }
3867 else { /* begin base64-encoded section */
3868 inShift = 1;
3869 shiftOutStart = outpos;
3870 base64bits = 0;
3871 }
3872 }
3873 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
3874 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3875 goto onError;
3876 s++;
3877 }
3878 else {
3879 startinpos = s-starts;
3880 s++;
3881 errmsg = "unexpected special character";
3882 goto utf7Error;
3883 }
3884 continue;
3885 utf7Error:
3886 endinpos = s-starts;
3887 if (unicode_decode_call_errorhandler(
3888 errors, &errorHandler,
3889 "utf7", errmsg,
3890 &starts, &e, &startinpos, &endinpos, &exc, &s,
3891 &unicode, &outpos))
3892 goto onError;
3893 }
3894
3895 /* end of string */
3896
3897 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3898 /* if we're in an inconsistent state, that's an error */
3899 if (surrogate ||
3900 (base64bits >= 6) ||
3901 (base64bits > 0 && base64buffer != 0)) {
3902 endinpos = size;
3903 if (unicode_decode_call_errorhandler(
3904 errors, &errorHandler,
3905 "utf7", "unterminated shift sequence",
3906 &starts, &e, &startinpos, &endinpos, &exc, &s,
3907 &unicode, &outpos))
3908 goto onError;
3909 if (s < e)
3910 goto restart;
3911 }
3912 }
3913
3914 /* return state */
3915 if (consumed) {
3916 if (inShift) {
3917 outpos = shiftOutStart; /* back off output */
3918 *consumed = startinpos;
3919 }
3920 else {
3921 *consumed = s-starts;
3922 }
3923 }
3924
3925 if (unicode_resize(&unicode, outpos) < 0)
3926 goto onError;
3927
3928 Py_XDECREF(errorHandler);
3929 Py_XDECREF(exc);
3930 return unicode_result(unicode);
3931
3932 onError:
3933 Py_XDECREF(errorHandler);
3934 Py_XDECREF(exc);
3935 Py_DECREF(unicode);
3936 return NULL;
3937 }
3938
3939
3940 PyObject *
3941 _PyUnicode_EncodeUTF7(PyObject *str,
3942 int base64SetO,
3943 int base64WhiteSpace,
3944 const char *errors)
3945 {
3946 int kind;
3947 void *data;
3948 Py_ssize_t len;
3949 PyObject *v;
3950 Py_ssize_t allocated;
3951 int inShift = 0;
3952 Py_ssize_t i;
3953 unsigned int base64bits = 0;
3954 unsigned long base64buffer = 0;
3955 char * out;
3956 char * start;
3957
3958 if (PyUnicode_READY(str) < 0)
3959 return NULL;
3960 kind = PyUnicode_KIND(str);
3961 data = PyUnicode_DATA(str);
3962 len = PyUnicode_GET_LENGTH(str);
3963
3964 if (len == 0)
3965 return PyBytes_FromStringAndSize(NULL, 0);
3966
3967 /* It might be possible to tighten this worst case */
3968 allocated = 8 * len;
3969 if (allocated / 8 != len)
3970 return PyErr_NoMemory();
3971
3972 v = PyBytes_FromStringAndSize(NULL, allocated);
3973 if (v == NULL)
3974 return NULL;
3975
3976 start = out = PyBytes_AS_STRING(v);
3977 for (i = 0; i < len; ++i) {
3978 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
3979
3980 if (inShift) {
3981 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3982 /* shifting out */
3983 if (base64bits) { /* output remaining bits */
3984 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3985 base64buffer = 0;
3986 base64bits = 0;
3987 }
3988 inShift = 0;
3989 /* Characters not in the BASE64 set implicitly unshift the sequence
3990 so no '-' is required, except if the character is itself a '-' */
3991 if (IS_BASE64(ch) || ch == '-') {
3992 *out++ = '-';
3993 }
3994 *out++ = (char) ch;
3995 }
3996 else {
3997 goto encode_char;
3998 }
3999 }
4000 else { /* not in a shift sequence */
4001 if (ch == '+') {
4002 *out++ = '+';
4003 *out++ = '-';
4004 }
4005 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4006 *out++ = (char) ch;
4007 }
4008 else {
4009 *out++ = '+';
4010 inShift = 1;
4011 goto encode_char;
4012 }
4013 }
4014 continue;
4015 encode_char:
4016 if (ch >= 0x10000) {
4017 assert(ch <= MAX_UNICODE);
4018
4019 /* code first surrogate */
4020 base64bits += 16;
4021 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4022 while (base64bits >= 6) {
4023 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4024 base64bits -= 6;
4025 }
4026 /* prepare second surrogate */
4027 ch = Py_UNICODE_LOW_SURROGATE(ch);
4028 }
4029 base64bits += 16;
4030 base64buffer = (base64buffer << 16) | ch;
4031 while (base64bits >= 6) {
4032 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4033 base64bits -= 6;
4034 }
4035 }
4036 if (base64bits)
4037 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4038 if (inShift)
4039 *out++ = '-';
4040 if (_PyBytes_Resize(&v, out - start) < 0)
4041 return NULL;
4042 return v;
4043 }
4044 PyObject *
4045 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4046 Py_ssize_t size,
4047 int base64SetO,
4048 int base64WhiteSpace,
4049 const char *errors)
4050 {
4051 PyObject *result;
4052 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4053 if (tmp == NULL)
4054 return NULL;
4055 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4056 base64WhiteSpace, errors);
4057 Py_DECREF(tmp);
4058 return result;
4059 }
4060
4061 #undef IS_BASE64
4062 #undef FROM_BASE64
4063 #undef TO_BASE64
4064 #undef DECODE_DIRECT
4065 #undef ENCODE_DIRECT
4066
4067 /* --- UTF-8 Codec -------------------------------------------------------- */
4068
/* Length of a UTF-8 sequence, indexed by its first byte.  A zero entry
   marks a byte that cannot start a sequence (continuation bytes 80-BF,
   the lead bytes C0-C1, and F5-FF).  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 70-7F */
    /* 80-BF: continuation bytes, never a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* B0-BF */
    /* C0-C1 illegal, C2-DF: two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* D0-DF */
    /* E0-EF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,   /* E0-EF */
    /* F0-F4: four-byte sequences, F5-FF illegal */
    4, 4, 4, 4, 4, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0    /* F0-FF */
};
4090
4091 PyObject *
4092 PyUnicode_DecodeUTF8(const char *s,
4093 Py_ssize_t size,
4094 const char *errors)
4095 {
4096 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4097 }
4098
4099 #include "stringlib/ucs1lib.h"
4100 #include "stringlib/codecs.h"
4101 #include "stringlib/undef.h"
4102
4103 #include "stringlib/ucs2lib.h"
4104 #include "stringlib/codecs.h"
4105 #include "stringlib/undef.h"
4106
4107 #include "stringlib/ucs4lib.h"
4108 #include "stringlib/codecs.h"
4109 #include "stringlib/undef.h"
4110
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   `addr & LONG_PTR_MASK` is zero for an aligned address and
   `addr & ~LONG_PTR_MASK` rounds an address down to the boundary.
   The expansion is fully parenthesized so the macro behaves as a single
   operand in every expression context. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
4113
4114 /* Mask to quickly check whether a C 'long' contains a
4115 non-ASCII, UTF8-encoded char. */
4116 #if (SIZEOF_LONG == 8)
4117 # define ASCII_CHAR_MASK 0x8080808080808080L
4118 #elif (SIZEOF_LONG == 4)
4119 # define ASCII_CHAR_MASK 0x80808080L
4120 #else
4121 # error C 'long' size should be either 4 or 8!
4122 #endif
4123
4124 /* Scans a UTF-8 string and returns the maximum character to be expected
4125 and the size of the decoded unicode string.
4126
4127 This function doesn't check for errors, these checks are performed in
4128 PyUnicode_DecodeUTF8Stateful.
4129 */
4130 static Py_UCS4
4131 utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4132 Py_ssize_t *unicode_size)
4133 {
4134 Py_ssize_t char_count = 0;
4135 const unsigned char *p = (const unsigned char *)s;
4136 const unsigned char *end = p + string_size;
4137 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4138
4139 assert(unicode_size != NULL);
4140
4141 /* By having a cascade of independent loops which fallback onto each
4142 other, we minimize the amount of work done in the average loop
4143 iteration, and we also maximize the CPU's ability to predict
4144 branches correctly (because a given condition will have always the
4145 same boolean outcome except perhaps in the last iteration of the
4146 corresponding loop).
4147 In the general case this brings us rather close to decoding
4148 performance pre-PEP 393, despite the two-pass decoding.
4149
4150 Note that the pure ASCII loop is not duplicated once a non-ASCII
4151 character has been encountered. It is actually a pessimization (by
4152 a significant factor) to use this loop on text with many non-ASCII
4153 characters, and it is important to avoid bad performance on valid
4154 utf-8 data (invalid utf-8 being a different can of worms).
4155 */
4156
4157 /* ASCII */
4158 for (; p < end; ++p) {
4159 /* Only check value if it's not a ASCII char... */
4160 if (*p < 0x80) {
4161 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4162 an explanation. */
4163 if (!((size_t) p & LONG_PTR_MASK)) {
4164 /* Help register allocation */
4165 register const unsigned char *_p = p;
4166 while (_p < aligned_end) {
4167 unsigned long value = *(unsigned long *) _p;
4168 if (value & ASCII_CHAR_MASK)
4169 break;
4170 _p += SIZEOF_LONG;
4171 char_count += SIZEOF_LONG;
4172 }
4173 p = _p;
4174 if (p == end)
4175 break;
4176 }
4177 }
4178 if (*p < 0x80)
4179 ++char_count;
4180 else
4181 goto _ucs1loop;
4182 }
4183 *unicode_size = char_count;
4184 return 127;
4185
4186 _ucs1loop:
4187 for (; p < end; ++p) {
4188 if (*p < 0xc4)
4189 char_count += ((*p & 0xc0) != 0x80);
4190 else
4191 goto _ucs2loop;
4192 }
4193 *unicode_size = char_count;
4194 return 255;
4195
4196 _ucs2loop:
4197 for (; p < end; ++p) {
4198 if (*p < 0xf0)
4199 char_count += ((*p & 0xc0) != 0x80);
4200 else
4201 goto _ucs4loop;
4202 }
4203 *unicode_size = char_count;
4204 return 65535;
4205
4206 _ucs4loop:
4207 for (; p < end; ++p) {
4208 char_count += ((*p & 0xc0) != 0x80);
4209 }
4210 *unicode_size = char_count;
4211 return 65537;
4212 }
4213
4214 /* Called when we encountered some error that wasn't detected in the original
4215 scan, e.g. an encoded surrogate character. The original maxchar computation
4216 may have been incorrect, so redo it. */
4217 static int
4218 refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4219 {
4220 PyObject *tmp;
4221 Py_ssize_t k;
4222 Py_UCS4 maxchar;
4223 for (k = 0, maxchar = 0; k < n; k++)
4224 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4225 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4226 if (tmp == NULL)
4227 return -1;
4228 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4229 Py_DECREF(*unicode);
4230 *unicode = tmp;
4231 return 0;
4232 }
4233
/* Store `value` at position `index` of the string being decoded.

   Without pending errors this is a plain PyUnicode_WRITE into the
   kind/data buffer.  Once has_errors is set, the string is first grown
   (to index + index/8) when index lies beyond its current length and the
   character is then stored through unicode_putchar().  Any failure jumps
   to onError.

   Implicit parameters taken from the expansion site: unicode, kind, data,
   has_errors and the onError label.  Growing overallocates, so the result
   needs to be shrunk at the end.
*/
#define WRITE_MAYBE_FAIL(index, value)                                   \
    do {                                                                 \
        if (!has_errors) {                                               \
            PyUnicode_WRITE(kind, data, index, value);                   \
        }                                                                \
        else {                                                           \
            Py_ssize_t wmf_pos = index;                                  \
            if (wmf_pos > PyUnicode_GET_LENGTH(unicode) &&               \
                unicode_resize(&unicode, wmf_pos + wmf_pos/8) < 0)       \
                goto onError;                                            \
            if (unicode_putchar(&unicode, &wmf_pos, value) < 0)          \
                goto onError;                                            \
        }                                                                \
    } while (0)
4252
4253 PyObject *
4254 PyUnicode_DecodeUTF8Stateful(const char *s,
4255 Py_ssize_t size,
4256 const char *errors,
4257 Py_ssize_t *consumed)
4258 {
4259 const char *starts = s;
4260 int n;
4261 int k;
4262 Py_ssize_t startinpos;
4263 Py_ssize_t endinpos;
4264 const char *e, *aligned_end;
4265 PyObject *unicode;
4266 const char *errmsg = "";
4267 PyObject *errorHandler = NULL;
4268 PyObject *exc = NULL;
4269 Py_UCS4 maxchar = 0;
4270 Py_ssize_t unicode_size;
4271 Py_ssize_t i;
4272 int kind;
4273 void *data;
4274 int has_errors = 0;
4275
4276 if (size == 0) {
4277 if (consumed)
4278 *consumed = 0;
4279 return (PyObject *)PyUnicode_New(0, 0);
4280 }
4281 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
4282 /* When the string is ASCII only, just use memcpy and return.
4283 unicode_size may be != size if there is an incomplete UTF-8
4284 sequence at the end of the ASCII block. */
4285 if (maxchar < 128 && size == unicode_size) {
4286 if (consumed)
4287 *consumed = size;
4288
4289 if (size == 1)
4290 return get_latin1_char((unsigned char)s[0]);
4291
4292 unicode = PyUnicode_New(unicode_size, maxchar);
4293 if (!unicode)
4294 return NULL;
4295 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4296 assert(_PyUnicode_CheckConsistency(unicode, 1));
4297 return unicode;
4298 }
4299
4300 /* In case of errors, maxchar and size computation might be incorrect;
4301 code below refits and resizes as necessary. */
4302 unicode = PyUnicode_New(unicode_size, maxchar);
4303 if (!unicode)
4304 return NULL;
4305 kind = PyUnicode_KIND(unicode);
4306 data = PyUnicode_DATA(unicode);
4307
4308 /* Unpack UTF-8 encoded data */
4309 i = 0;
4310 e = s + size;
4311 switch (kind) {
4312 case PyUnicode_1BYTE_KIND:
4313 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4314 break;
4315 case PyUnicode_2BYTE_KIND:
4316 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4317 break;
4318 case PyUnicode_4BYTE_KIND:
4319 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4320 break;
4321 }
4322 if (!has_errors) {
4323 /* Ensure the unicode size calculation was correct */
4324 assert(i == unicode_size);
4325 assert(s == e);
4326 if (consumed)
4327 *consumed = s-starts;
4328 return unicode;
4329 }
4330 /* Fall through to the generic decoding loop for the rest of
4331 the string */
4332 if (refit_partial_string(&unicode, kind, data, i) < 0)
4333 goto onError;
4334
4335 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4336
4337 while (s < e) {
4338 Py_UCS4 ch = (unsigned char)*s;
4339
4340 if (ch < 0x80) {
4341 /* Fast path for runs of ASCII characters. Given that common UTF-8
4342 input will consist of an overwhelming majority of ASCII
4343 characters, we try to optimize for this case by checking
4344 as many characters as a C 'long' can contain.
4345 First, check if we can do an aligned read, as most CPUs have
4346 a penalty for unaligned reads.
4347 */
4348 if (!((size_t) s & LONG_PTR_MASK)) {
4349 /* Help register allocation */
4350 register const char *_s = s;
4351 register Py_ssize_t _i = i;
4352 while (_s < aligned_end) {
4353 /* Read a whole long at a time (either 4 or 8 bytes),
4354 and do a fast unrolled copy if it only contains ASCII
4355 characters. */
4356 unsigned long value = *(unsigned long *) _s;
4357 if (value & ASCII_CHAR_MASK)
4358 break;
4359 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4360 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4361 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4362 WRITE_MAYBE_FAIL(_i+3, _s[3]);
4363 #if (SIZEOF_LONG == 8)
4364 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4365 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4366 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4367 WRITE_MAYBE_FAIL(_i+7, _s[7]);
4368 #endif
4369 _s += SIZEOF_LONG;
4370 _i += SIZEOF_LONG;
4371 }
4372 s = _s;
4373 i = _i;
4374 if (s == e)
4375 break;
4376 ch = (unsigned char)*s;
4377 }
4378 }
4379
4380 if (ch < 0x80) {
4381 WRITE_MAYBE_FAIL(i++, ch);
4382 s++;
4383 continue;
4384 }
4385
4386 n = utf8_code_length[ch];
4387
4388 if (s + n > e) {
4389 if (consumed)
4390 break;
4391 else {
4392 errmsg = "unexpected end of data";
4393 startinpos = s-starts;
4394 endinpos = startinpos+1;
4395 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4396 endinpos++;
4397 goto utf8Error;
4398 }
4399 }
4400
4401 switch (n) {
4402
4403 case 0:
4404 errmsg = "invalid start byte";
4405 startinpos = s-starts;
4406 endinpos = startinpos+1;
4407 goto utf8Error;
4408
4409 case 1:
4410 errmsg = "internal error";
4411 startinpos = s-starts;
4412 endinpos = startinpos+1;
4413 goto utf8Error;
4414
4415 case 2:
4416 if ((s[1] & 0xc0) != 0x80) {
4417 errmsg = "invalid continuation byte";
4418 startinpos = s-starts;
4419 endinpos = startinpos + 1;
4420 goto utf8Error;
4421 }
4422 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4423 assert ((ch > 0x007F) && (ch <= 0x07FF));
4424 WRITE_MAYBE_FAIL(i++, ch);
4425 break;
4426
4427 case 3:
4428 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4429 will result in surrogates in range d800-dfff. Surrogates are
4430 not valid UTF-8 so they are rejected.
4431 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4432 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4433 if ((s[1] & 0xc0) != 0x80 ||
4434 (s[2] & 0xc0) != 0x80 ||
4435 ((unsigned char)s[0] == 0xE0 &&
4436 (unsigned char)s[1] < 0xA0) ||
4437 ((unsigned char)s[0] == 0xED &&
4438 (unsigned char)s[1] > 0x9F)) {
4439 errmsg = "invalid continuation byte";
4440 startinpos = s-starts;
4441 endinpos = startinpos + 1;
4442
4443 /* if s[1] first two bits are 1 and 0, then the invalid
4444 continuation byte is s[2], so increment endinpos by 1,
4445 if not, s[1] is invalid and endinpos doesn't need to
4446 be incremented. */
4447 if ((s[1] & 0xC0) == 0x80)
4448 endinpos++;
4449 goto utf8Error;
4450 }
4451 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4452 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4453 WRITE_MAYBE_FAIL(i++, ch);
4454 break;
4455
4456 case 4:
4457 if ((s[1] & 0xc0) != 0x80 ||
4458 (s[2] & 0xc0) != 0x80 ||
4459 (s[3] & 0xc0) != 0x80 ||
4460 ((unsigned char)s[0] == 0xF0 &&
4461 (unsigned char)s[1] < 0x90) ||
4462 ((unsigned char)s[0] == 0xF4 &&
4463 (unsigned char)s[1] > 0x8F)) {
4464 errmsg = "invalid continuation byte";
4465 startinpos = s-starts;
4466 endinpos = startinpos + 1;
4467 if ((s[1] & 0xC0) == 0x80) {
4468 endinpos++;
4469 if ((s[2] & 0xC0) == 0x80)
4470 endinpos++;
4471 }
4472 goto utf8Error;
4473 }
4474 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4475 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4476 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
4477
4478 WRITE_MAYBE_FAIL(i++, ch);
4479 break;
4480 }
4481 s += n;
4482 continue;
4483
4484 utf8Error:
4485 if (!has_errors) {
4486 if (refit_partial_string(&unicode, kind, data, i) < 0)
4487 goto onError;
4488 has_errors = 1;
4489 }
4490 if (unicode_decode_call_errorhandler(
4491 errors, &errorHandler,
4492 "utf8", errmsg,
4493 &starts, &e, &startinpos, &endinpos, &exc, &s,
4494 &unicode, &i))
4495 goto onError;
4496 /* Update data because unicode_decode_call_errorhandler might have
4497 re-created or resized the unicode object. */
4498 data = PyUnicode_DATA(unicode);
4499 kind = PyUnicode_KIND(unicode);
4500 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4501 }
4502 /* Ensure the unicode_size calculation above was correct: */
4503 assert(has_errors || i == unicode_size);
4504
4505 if (consumed)
4506 *consumed = s-starts;
4507
4508 /* Adjust length and ready string when it contained errors and
4509 is of the old resizable kind. */
4510 if (has_errors) {
4511 if (PyUnicode_Resize(&unicode, i) < 0)
4512 goto onError;
4513 }
4514
4515 Py_XDECREF(errorHandler);
4516 Py_XDECREF(exc);
4517 assert(_PyUnicode_CheckConsistency(unicode, 1));
4518 return unicode;
4519
4520 onError:
4521 Py_XDECREF(errorHandler);
4522 Py_XDECREF(exc);
4523 Py_DECREF(unicode);
4524 return NULL;
4525 }
4526
4527 #undef WRITE_MAYBE_FAIL
4528
4529 #ifdef __APPLE__
4530
4531 /* Simplified UTF-8 decoder using surrogateescape error handler,
4532 used to decode the command line arguments on Mac OS X. */
4533
4534 wchar_t*
4535 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4536 {
4537 int n;
4538 const char *e;
4539 wchar_t *unicode, *p;
4540
4541 /* Note: size will always be longer than the resulting Unicode
4542 character count */
4543 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4544 PyErr_NoMemory();
4545 return NULL;
4546 }
4547 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4548 if (!unicode)
4549 return NULL;
4550
4551 /* Unpack UTF-8 encoded data */
4552 p = unicode;
4553 e = s + size;
4554 while (s < e) {
4555 Py_UCS4 ch = (unsigned char)*s;
4556
4557 if (ch < 0x80) {
4558 *p++ = (wchar_t)ch;
4559 s++;
4560 continue;
4561 }
4562
4563 n = utf8_code_length[ch];
4564 if (s + n > e) {
4565 goto surrogateescape;
4566 }
4567
4568 switch (n) {
4569 case 0:
4570 case 1:
4571 goto surrogateescape;
4572
4573 case 2:
4574 if ((s[1] & 0xc0) != 0x80)
4575 goto surrogateescape;
4576 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4577 assert ((ch > 0x007F) && (ch <= 0x07FF));
4578 *p++ = (wchar_t)ch;
4579 break;
4580
4581 case 3:
4582 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4583 will result in surrogates in range d800-dfff. Surrogates are
4584 not valid UTF-8 so they are rejected.
4585 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4586 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4587 if ((s[1] & 0xc0) != 0x80 ||
4588 (s[2] & 0xc0) != 0x80 ||
4589 ((unsigned char)s[0] == 0xE0 &&
4590 (unsigned char)s[1] < 0xA0) ||
4591 ((unsigned char)s[0] == 0xED &&
4592 (unsigned char)s[1] > 0x9F)) {
4593
4594 goto surrogateescape;
4595 }
4596 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4597 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4598 *p++ = (wchar_t)ch;
4599 break;
4600
4601 case 4:
4602 if ((s[1] & 0xc0) != 0x80 ||
4603 (s[2] & 0xc0) != 0x80 ||
4604 (s[3] & 0xc0) != 0x80 ||
4605 ((unsigned char)s[0] == 0xF0 &&
4606 (unsigned char)s[1] < 0x90) ||
4607 ((unsigned char)s[0] == 0xF4 &&
4608 (unsigned char)s[1] > 0x8F)) {
4609 goto surrogateescape;
4610 }
4611 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4612 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4613 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
4614
4615 #if SIZEOF_WCHAR_T == 4
4616 *p++ = (wchar_t)ch;
4617 #else
4618 /* compute and append the two surrogates: */
4619 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4620 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4621 #endif
4622 break;
4623 }
4624 s += n;
4625 continue;
4626
4627 surrogateescape:
4628 *p++ = 0xDC00 + ch;
4629 s++;
4630 }
4631 *p = L'\0';
4632 return unicode;
4633 }
4634
4635 #endif /* __APPLE__ */
4636
4637 /* Primary internal function which creates utf8 encoded bytes objects.
4638
4639 Allocation strategy: if the string is short, convert into a stack buffer
4640 and allocate exactly as much space needed at the end. Else allocate the
4641 maximum possible needed (4 result bytes per Unicode character), and return
4642 the excess memory at the end.
4643 */
4644 PyObject *
4645 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4646 {
4647 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
4648
4649 Py_ssize_t i; /* index into s of next input byte */
4650 PyObject *result; /* result string object */
4651 char *p; /* next free byte in output buffer */
4652 Py_ssize_t nallocated; /* number of result bytes allocated */
4653 Py_ssize_t nneeded; /* number of result bytes needed */
4654 char stackbuf[MAX_SHORT_UNICHARS * 4];
4655 PyObject *errorHandler = NULL;
4656 PyObject *exc = NULL;
4657 int kind;
4658 void *data;
4659 Py_ssize_t size;
4660 PyObject *rep = NULL;
4661
4662 if (!PyUnicode_Check(unicode)) {
4663 PyErr_BadArgument();
4664 return NULL;
4665 }
4666
4667 if (PyUnicode_READY(unicode) == -1)
4668 return NULL;
4669
4670 if (PyUnicode_UTF8(unicode))
4671 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4672 PyUnicode_UTF8_LENGTH(unicode));
4673
4674 kind = PyUnicode_KIND(unicode);
4675 data = PyUnicode_DATA(unicode);
4676 size = PyUnicode_GET_LENGTH(unicode);
4677
4678 assert(size >= 0);
4679
4680 if (size <= MAX_SHORT_UNICHARS) {
4681 /* Write into the stack buffer; nallocated can't overflow.
4682 * At the end, we'll allocate exactly as much heap space as it
4683 * turns out we need.
4684 */
4685 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
4686 result = NULL; /* will allocate after we're done */
4687 p = stackbuf;
4688 }
4689 else {
4690 /* Overallocate on the heap, and give the excess back at the end. */
4691 nallocated = size * 4;
4692 if (nallocated / 4 != size) /* overflow! */
4693 return PyErr_NoMemory();
4694 result = PyBytes_FromStringAndSize(NULL, nallocated);
4695 if (result == NULL)
4696 return NULL;
4697 p = PyBytes_AS_STRING(result);
4698 }
4699
4700 for (i = 0; i < size;) {
4701 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
4702
4703 if (ch < 0x80)
4704 /* Encode ASCII */
4705 *p++ = (char) ch;
4706
4707 else if (ch < 0x0800) {
4708 /* Encode Latin-1 */
4709 *p++ = (char)(0xc0 | (ch >> 6));
4710 *p++ = (char)(0x80 | (ch & 0x3f));
4711 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
4712 Py_ssize_t newpos;
4713 Py_ssize_t repsize, k, startpos;
4714 startpos = i-1;
4715 rep = unicode_encode_call_errorhandler(
4716 errors, &errorHandler, "utf-8", "surrogates not allowed",
4717 unicode, &exc, startpos, startpos+1, &newpos);
4718 if (!rep)
4719 goto error;
4720
4721 if (PyBytes_Check(rep))
4722 repsize = PyBytes_GET_SIZE(rep);
4723 else
4724 repsize = PyUnicode_GET_LENGTH(rep);
4725
4726 if (repsize > 4) {
4727 Py_ssize_t offset;
4728
4729 if (result == NULL)
4730 offset = p - stackbuf;
4731 else
4732 offset = p - PyBytes_AS_STRING(result);
4733
4734 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4735 /* integer overflow */
4736 PyErr_NoMemory();
4737 goto error;
4738 }
4739 nallocated += repsize - 4;
4740 if (result != NULL) {
4741 if (_PyBytes_Resize(&result, nallocated) < 0)
4742 goto error;
4743 } else {
4744 result = PyBytes_FromStringAndSize(NULL, nallocated);
4745 if (result == NULL)
4746 goto error;
4747 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4748 }
4749 p = PyBytes_AS_STRING(result) + offset;
4750 }
4751
4752 if (PyBytes_Check(rep)) {
4753 char *prep = PyBytes_AS_STRING(rep);
4754 for(k = repsize; k > 0; k--)
4755 *p++ = *prep++;
4756 } else /* rep is unicode */ {
4757 enum PyUnicode_Kind repkind;
4758 void *repdata;
4759
4760 if (PyUnicode_READY(rep) < 0)
4761 goto error;
4762 repkind = PyUnicode_KIND(rep);
4763 repdata = PyUnicode_DATA(rep);
4764
4765 for(k=0; k<repsize; k++) {
4766 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
4767 if (0x80 <= c) {
4768 raise_encode_exception(&exc, "utf-8",
4769 unicode,
4770 i-1, i,
4771 "surrogates not allowed");
4772 goto error;
4773 }
4774 *p++ = (char)c;
4775 }
4776 }
4777 Py_CLEAR(rep);
4778 } else if (ch < 0x10000) {
4779 *p++ = (char)(0xe0 | (ch >> 12));
4780 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4781 *p++ = (char)(0x80 | (ch & 0x3f));
4782 } else /* ch >= 0x10000 */ {
4783 assert(ch <= MAX_UNICODE);
4784 /* Encode UCS4 Unicode ordinals */
4785 *p++ = (char)(0xf0 | (ch >> 18));
4786 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4787 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4788 *p++ = (char)(0x80 | (ch & 0x3f));
4789 }
4790 }
4791
4792 if (result == NULL) {
4793 /* This was stack allocated. */
4794 nneeded = p - stackbuf;
4795 assert(nneeded <= nallocated);
4796 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
4797 }
4798 else {
4799 /* Cut back to size actually needed. */
4800 nneeded = p - PyBytes_AS_STRING(result);
4801 assert(nneeded <= nallocated);
4802 _PyBytes_Resize(&result, nneeded);
4803 }
4804
4805 Py_XDECREF(errorHandler);
4806 Py_XDECREF(exc);
4807 return result;
4808 error:
4809 Py_XDECREF(rep);
4810 Py_XDECREF(errorHandler);
4811 Py_XDECREF(exc);
4812 Py_XDECREF(result);
4813 return NULL;
4814
4815 #undef MAX_SHORT_UNICHARS
4816 }
4817
4818 PyObject *
4819 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4820 Py_ssize_t size,
4821 const char *errors)
4822 {
4823 PyObject *v, *unicode;
4824
4825 unicode = PyUnicode_FromUnicode(s, size);
4826 if (unicode == NULL)
4827 return NULL;
4828 v = _PyUnicode_AsUTF8String(unicode, errors);
4829 Py_DECREF(unicode);
4830 return v;
4831 }
4832
4833 PyObject *
4834 PyUnicode_AsUTF8String(PyObject *unicode)
4835 {
4836 return _PyUnicode_AsUTF8String(unicode, NULL);
4837 }
4838
4839 /* --- UTF-32 Codec ------------------------------------------------------- */
4840
4841 PyObject *
4842 PyUnicode_DecodeUTF32(const char *s,
4843 Py_ssize_t size,
4844 const char *errors,
4845 int *byteorder)
4846 {
4847 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4848 }
4849
4850 PyObject *
4851 PyUnicode_DecodeUTF32Stateful(const char *s,
4852 Py_ssize_t size,
4853 const char *errors,
4854 int *byteorder,
4855 Py_ssize_t *consumed)
4856 {
4857 const char *starts = s;
4858 Py_ssize_t startinpos;
4859 Py_ssize_t endinpos;
4860 Py_ssize_t outpos;
4861 PyObject *unicode;
4862 const unsigned char *q, *e;
4863 int bo = 0; /* assume native ordering by default */
4864 const char *errmsg = "";
4865 /* Offsets from q for retrieving bytes in the right order. */
4866 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
4867 int iorder[] = {0, 1, 2, 3};
4868 #else
4869 int iorder[] = {3, 2, 1, 0};
4870 #endif
4871 PyObject *errorHandler = NULL;
4872 PyObject *exc = NULL;
4873
4874 q = (unsigned char *)s;
4875 e = q + size;
4876
4877 if (byteorder)
4878 bo = *byteorder;
4879
4880 /* Check for BOM marks (U+FEFF) in the input and adjust current
4881 byte order setting accordingly. In native mode, the leading BOM
4882 mark is skipped, in all other modes, it is copied to the output
4883 stream as-is (giving a ZWNBSP character). */
4884 if (bo == 0) {
4885 if (size >= 4) {
4886 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4887 (q[iorder[1]] << 8) | q[iorder[0]];
4888 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
4889 if (bom == 0x0000FEFF) {
4890 q += 4;
4891 bo = -1;
4892 }
4893 else if (bom == 0xFFFE0000) {
4894 q += 4;
4895 bo = 1;
4896 }
4897 #else
4898 if (bom == 0x0000FEFF) {
4899 q += 4;
4900 bo = 1;
4901 }
4902 else if (bom == 0xFFFE0000) {
4903 q += 4;
4904 bo = -1;
4905 }
4906 #endif
4907 }
4908 }
4909
4910 if (bo == -1) {
4911 /* force LE */
4912 iorder[0] = 0;
4913 iorder[1] = 1;
4914 iorder[2] = 2;
4915 iorder[3] = 3;
4916 }
4917 else if (bo == 1) {
4918 /* force BE */
4919 iorder[0] = 3;
4920 iorder[1] = 2;
4921 iorder[2] = 1;
4922 iorder[3] = 0;
4923 }
4924
4925 /* This might be one to much, because of a BOM */
4926 unicode = PyUnicode_New((size+3)/4, 127);
4927 if (!unicode)
4928 return NULL;
4929 if (size == 0)
4930 return unicode;
4931 outpos = 0;
4932
4933 while (q < e) {
4934 Py_UCS4 ch;
4935 /* remaining bytes at the end? (size should be divisible by 4) */
4936 if (e-q<4) {
4937 if (consumed)
4938 break;
4939 errmsg = "truncated data";
4940 startinpos = ((const char *)q)-starts;
4941 endinpos = ((const char *)e)-starts;
4942 goto utf32Error;
4943 /* The remaining input chars are ignored if the callback
4944 chooses to skip the input */
4945 }
4946 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4947 (q[iorder[1]] << 8) | q[iorder[0]];
4948
4949 if (ch >= 0x110000)
4950 {
4951 errmsg = "codepoint not in range(0x110000)";
4952 startinpos = ((const char *)q)-starts;
4953 endinpos = startinpos+4;
4954 goto utf32Error;
4955 }
4956 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4957 goto onError;
4958 q += 4;
4959 continue;
4960 utf32Error:
4961 if (unicode_decode_call_errorhandler(
4962 errors, &errorHandler,
4963 "utf32", errmsg,
4964 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4965 &unicode, &outpos))
4966 goto onError;
4967 }
4968
4969 if (byteorder)
4970 *byteorder = bo;
4971
4972 if (consumed)
4973 *consumed = (const char *)q-starts;
4974
4975 /* Adjust length */
4976 if (PyUnicode_Resize(&unicode, outpos) < 0)
4977 goto onError;
4978
4979 Py_XDECREF(errorHandler);
4980 Py_XDECREF(exc);
4981 return unicode_result(unicode);
4982
4983 onError:
4984 Py_DECREF(unicode);
4985 Py_XDECREF(errorHandler);
4986 Py_XDECREF(exc);
4987 return NULL;
4988 }
4989
4990 PyObject *
4991 _PyUnicode_EncodeUTF32(PyObject *str,
4992 const char *errors,
4993 int byteorder)
4994 {
4995 int kind;
4996 void *data;
4997 Py_ssize_t len;
4998 PyObject *v;
4999 unsigned char *p;
5000 Py_ssize_t nsize, bytesize, i;
5001 /* Offsets from p for storing byte pairs in the right order. */
5002 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
5003 int iorder[] = {0, 1, 2, 3};
5004 #else
5005 int iorder[] = {3, 2, 1, 0};
5006 #endif
5007
5008 #define STORECHAR(CH) \
5009 do { \
5010 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5011 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5012 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5013 p[iorder[0]] = (CH) & 0xff; \
5014 p += 4; \
5015 } while(0)
5016
5017 if (!PyUnicode_Check(str)) {
5018 PyErr_BadArgument();
5019 return NULL;
5020 }
5021 if (PyUnicode_READY(str) < 0)
5022 return NULL;
5023 kind = PyUnicode_KIND(str);
5024 data = PyUnicode_DATA(str);
5025 len = PyUnicode_GET_LENGTH(str);
5026
5027 nsize = len + (byteorder == 0);
5028 bytesize = nsize * 4;
5029 if (bytesize / 4 != nsize)
5030 return PyErr_NoMemory();
5031 v = PyBytes_FromStringAndSize(NULL, bytesize);
5032 if (v == NULL)
5033 return NULL;
5034
5035 p = (unsigned char *)PyBytes_AS_STRING(v);
5036 if (byteorder == 0)
5037 STORECHAR(0xFEFF);
5038 if (len == 0)
5039 goto done;
5040
5041 if (byteorder == -1) {
5042 /* force LE */
5043 iorder[0] = 0;
5044 iorder[1] = 1;
5045 iorder[2] = 2;
5046 iorder[3] = 3;
5047 }
5048 else if (byteorder == 1) {
5049 /* force BE */
5050 iorder[0] = 3;
5051 iorder[1] = 2;
5052 iorder[2] = 1;
5053 iorder[3] = 0;
5054 }
5055
5056 for (i = 0; i < len; i++)
5057 STORECHAR(PyUnicode_READ(kind, data, i));
5058
5059 done:
5060 return v;
5061 #undef STORECHAR
5062 }
5063
5064 PyObject *
5065 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5066 Py_ssize_t size,
5067 const char *errors,
5068 int byteorder)
5069 {
5070 PyObject *result;
5071 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5072 if (tmp == NULL)
5073 return NULL;
5074 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5075 Py_DECREF(tmp);
5076 return result;
5077 }
5078
5079 PyObject *
5080 PyUnicode_AsUTF32String(PyObject *unicode)
5081 {
5082 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5083 }
5084
5085 /* --- UTF-16 Codec ------------------------------------------------------- */
5086
5087 PyObject *
5088 PyUnicode_DecodeUTF16(const char *s,
5089 Py_ssize_t size,
5090 const char *errors,
5091 int *byteorder)
5092 {
5093 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5094 }
5095
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   Each mask selects the most significant bit of every 16-bit code unit
   packed into the long (bit 15 of each unit in native order, bit 7 of
   each unit as it appears before byte-swapping).  A non-zero AND with
   the block therefore means that at least one code unit is >= 0x8000,
   which covers every surrogate.  Only 4- and 8-byte longs are supported.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK 0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK 0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
5112
5113 PyObject *
5114 PyUnicode_DecodeUTF16Stateful(const char *s,
5115 Py_ssize_t size,
5116 const char *errors,
5117 int *byteorder,
5118 Py_ssize_t *consumed)
5119 {
5120 const char *starts = s;
5121 Py_ssize_t startinpos;
5122 Py_ssize_t endinpos;
5123 Py_ssize_t outpos;
5124 PyObject *unicode;
5125 const unsigned char *q, *e, *aligned_end;
5126 int bo = 0; /* assume native ordering by default */
5127 int native_ordering = 0;
5128 const char *errmsg = "";
5129 /* Offsets from q for retrieving byte pairs in the right order. */
5130 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
5131 int ihi = 1, ilo = 0;
5132 #else
5133 int ihi = 0, ilo = 1;
5134 #endif
5135 PyObject *errorHandler = NULL;
5136 PyObject *exc = NULL;
5137
5138 /* Note: size will always be longer than the resulting Unicode
5139 character count */
5140 unicode = PyUnicode_New(size, 127);
5141 if (!unicode)
5142 return NULL;
5143 if (size == 0)
5144 return unicode;
5145 outpos = 0;
5146
5147 q = (unsigned char *)s;
5148 e = q + size - 1;
5149
5150 if (byteorder)
5151 bo = *byteorder;
5152
5153 /* Check for BOM marks (U+FEFF) in the input and adjust current
5154 byte order setting accordingly. In native mode, the leading BOM
5155 mark is skipped, in all other modes, it is copied to the output
5156 stream as-is (giving a ZWNBSP character). */
5157 if (bo == 0) {
5158 if (size >= 2) {
5159 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
5160 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
5161 if (bom == 0xFEFF) {
5162 q += 2;
5163 bo = -1;
5164 }
5165 else if (bom == 0xFFFE) {
5166 q += 2;
5167 bo = 1;
5168 }
5169 #else
5170 if (bom == 0xFEFF) {
5171 q += 2;
5172 bo = 1;
5173 }
5174 else if (bom == 0xFFFE) {
5175 q += 2;
5176 bo = -1;
5177 }
5178 #endif
5179 }
5180 }
5181
5182 if (bo == -1) {
5183 /* force LE */
5184 ihi = 1;
5185 ilo = 0;
5186 }
5187 else if (bo == 1) {
5188 /* force BE */
5189 ihi = 0;
5190 ilo = 1;
5191 }
5192 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
5193 native_ordering = ilo < ihi;
5194 #else
5195 native_ordering = ilo > ihi;
5196 #endif
5197
5198 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
5199 while (q < e) {
5200 Py_UCS4 ch;
5201 /* First check for possible aligned read of a C 'long'. Unaligned
5202 reads are more expensive, better to defer to another iteration. */
5203 if (!((size_t) q & LONG_PTR_MASK)) {
5204 /* Fast path for runs of non-surrogate chars. */
5205 register const unsigned char *_q = q;
5206 int kind = PyUnicode_KIND(unicode);
5207 void *data = PyUnicode_DATA(unicode);
5208 while (_q < aligned_end) {
5209 unsigned long block = * (unsigned long *) _q;
5210 unsigned short *pblock = (unsigned short*)&block;
5211 Py_UCS4 maxch;
5212 if (native_ordering) {
5213 /* Can use buffer directly */
5214 if (block & FAST_CHAR_MASK)
5215 break;
5216 }
5217 else {
5218 /* Need to byte-swap */
5219 unsigned char *_p = (unsigned char*)pblock;
5220 if (block & SWAPPED_FAST_CHAR_MASK)
5221 break;
5222 _p[0] = _q[1];
5223 _p[1] = _q[0];
5224 _p[2] = _q[3];
5225 _p[3] = _q[2];
5226 #if (SIZEOF_LONG == 8)
5227 _p[4] = _q[5];
5228 _p[5] = _q[4];
5229 _p[6] = _q[7];
5230 _p[7] = _q[6];
5231 #endif
5232 }
5233 maxch = Py_MAX(pblock[0], pblock[1]);
5234 #if SIZEOF_LONG == 8
5235 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5236 #endif
5237 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5238 if (unicode_widen(&unicode, maxch) < 0)
5239 goto onError;
5240 kind = PyUnicode_KIND(unicode);
5241 data = PyUnicode_DATA(unicode);
5242 }
5243 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5244 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5245 #if SIZEOF_LONG == 8
5246 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5247 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5248 #endif
5249 _q += SIZEOF_LONG;
5250 }
5251 q = _q;
5252 if (q >= e)
5253 break;
5254 }
5255 ch = (q[ihi] << 8) | q[ilo];
5256
5257 q += 2;
5258
5259 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5260 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5261 goto onError;
5262 continue;
5263 }
5264
5265 /* UTF-16 code pair: */
5266 if (q > e) {
5267 errmsg = "unexpected end of data";
5268 startinpos = (((const char *)q) - 2) - starts;
5269 endinpos = ((const char *)e) + 1 - starts;
5270 goto utf16Error;
5271 }
5272 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5273 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
5274 q += 2;
5275 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
5276 if (unicode_putchar(&unicode, &outpos,
5277 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
5278 goto onError;
5279 continue;
5280 }
5281 else {
5282 errmsg = "illegal UTF-16 surrogate";
5283 startinpos = (((const char *)q)-4)-starts;
5284 endinpos = startinpos+2;
5285 goto utf16Error;
5286 }
5287
5288 }
5289 errmsg = "illegal encoding";
5290 startinpos = (((const char *)q)-2)-starts;
5291 endinpos = startinpos+2;
5292 /* Fall through to report the error */
5293
5294 utf16Error:
5295 if (unicode_decode_call_errorhandler(
5296 errors,
5297 &errorHandler,
5298 "utf16", errmsg,
5299 &starts,
5300 (const char **)&e,
5301 &startinpos,
5302 &endinpos,
5303 &exc,
5304 (const char **)&q,
5305 &unicode,
5306 &outpos))
5307 goto onError;
5308 }
5309 /* remaining byte at the end? (size should be even) */
5310 if (e == q) {
5311 if (!consumed) {
5312 errmsg = "truncated data";
5313 startinpos = ((const char *)q) - starts;
5314 endinpos = ((const char *)e) + 1 - starts;
5315 if (unicode_decode_call_errorhandler(
5316 errors,
5317 &errorHandler,
5318 "utf16", errmsg,
5319 &starts,
5320 (const char **)&e,
5321 &startinpos,
5322 &endinpos,
5323 &exc,
5324 (const char **)&q,
5325 &unicode,
5326 &outpos))
5327 goto onError;
5328 /* The remaining input chars are ignored if the callback
5329 chooses to skip the input */
5330 }
5331 }
5332
5333 if (byteorder)
5334 *byteorder = bo;
5335
5336 if (consumed)
5337 *consumed = (const char *)q-starts;
5338
5339 /* Adjust length */
5340 if (PyUnicode_Resize(&unicode, outpos) < 0)
5341 goto onError;
5342
5343 Py_XDECREF(errorHandler);
5344 Py_XDECREF(exc);
5345 return unicode_result(unicode);
5346
5347 onError:
5348 Py_DECREF(unicode);
5349 Py_XDECREF(errorHandler);
5350 Py_XDECREF(exc);
5351 return NULL;
5352 }
5353
5354 #undef FAST_CHAR_MASK
5355 #undef SWAPPED_FAST_CHAR_MASK
5356
5357 PyObject *
5358 _PyUnicode_EncodeUTF16(PyObject *str,
5359 const char *errors,
5360 int byteorder)
5361 {
5362 int kind;
5363 void *data;
5364 Py_ssize_t len;
5365 PyObject *v;
5366 unsigned char *p;
5367 Py_ssize_t nsize, bytesize;
5368 Py_ssize_t i, pairs;
5369 /* Offsets from p for storing byte pairs in the right order. */
5370 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
5371 int ihi = 1, ilo = 0;
5372 #else
5373 int ihi = 0, ilo = 1;
5374 #endif
5375
5376 #define STORECHAR(CH) \
5377 do { \
5378 p[ihi] = ((CH) >> 8) & 0xff; \
5379 p[ilo] = (CH) & 0xff; \
5380 p += 2; \
5381 } while(0)
5382
5383 if (!PyUnicode_Check(str)) {
5384 PyErr_BadArgument();
5385 return NULL;
5386 }
5387 if (PyUnicode_READY(str) < 0)
5388 return NULL;
5389 kind = PyUnicode_KIND(str);
5390 data = PyUnicode_DATA(str);
5391 len = PyUnicode_GET_LENGTH(str);
5392
5393 pairs = 0;
5394 if (kind == PyUnicode_4BYTE_KIND)
5395 for (i = 0; i < len; i++)
5396 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5397 pairs++;
5398 /* 2 * (len + pairs + (byteorder == 0)) */
5399 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5400 return PyErr_NoMemory();
5401 nsize = len + pairs + (byteorder == 0);
5402 bytesize = nsize * 2;
5403 if (bytesize / 2 != nsize)
5404 return PyErr_NoMemory();
5405 v = PyBytes_FromStringAndSize(NULL, bytesize);
5406 if (v == NULL)
5407 return NULL;
5408
5409 p = (unsigned char *)PyBytes_AS_STRING(v);
5410 if (byteorder == 0)
5411 STORECHAR(0xFEFF);
5412 if (len == 0)
5413 goto done;
5414
5415 if (byteorder == -1) {
5416 /* force LE */
5417 ihi = 1;
5418 ilo = 0;
5419 }
5420 else if (byteorder == 1) {
5421 /* force BE */
5422 ihi = 0;
5423 ilo = 1;
5424 }
5425
5426 for (i = 0; i < len; i++) {
5427 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5428 Py_UCS4 ch2 = 0;
5429 if (ch >= 0x10000) {
5430 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5431 ch = Py_UNICODE_HIGH_SURROGATE(ch);
5432 }
5433 STORECHAR(ch);
5434 if (ch2)
5435 STORECHAR(ch2);
5436 }
5437
5438 done:
5439 return v;
5440 #undef STORECHAR
5441 }
5442
5443 PyObject *
5444 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5445 Py_ssize_t size,
5446 const char *errors,
5447 int byteorder)
5448 {
5449 PyObject *result;
5450 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5451 if (tmp == NULL)
5452 return NULL;
5453 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5454 Py_DECREF(tmp);
5455 return result;
5456 }
5457
5458 PyObject *
5459 PyUnicode_AsUTF16String(PyObject *unicode)
5460 {
5461 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5462 }
5463
5464 /* --- Unicode Escape Codec ----------------------------------------------- */
5465
/* Helper for PyUnicode_DecodeUnicodeEscape: work out whether the escaped
   input decodes to a pure ASCII string and, if so, how long it will be.

   Returns the number of characters the decoded string needs, or -1 when
   the result is not known to stay within ASCII: `size` is negative, a
   byte above 127 occurs, a backslash is the last byte, or the escape is
   octal, \x, \u, \U or \N.  A backslash followed by a newline produces no
   character, the simple named escapes produce one, and an unrecognised
   escape is kept as two characters (backslash plus the following byte).
*/
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *bytes = (const unsigned char *)s;
    Py_ssize_t pos = 0;
    Py_ssize_t count = 0;

    if (size < 0)
        return -1;

    while (pos < size) {
        unsigned char c = bytes[pos++];

        if (c > 127)
            return -1;          /* non-ASCII */
        if (c != '\\') {
            count++;            /* ordinary character */
            continue;
        }

        /* Backslash: the escape character must exist and be ASCII. */
        if (pos >= size)
            return -1;
        c = bytes[pos++];
        if (c > 127)
            return -1;

        if (c == '\n')
            continue;           /* line continuation yields nothing */

        if (('0' <= c && c <= '7') ||
            c == 'x' || c == 'u' || c == 'U' || c == 'N') {
            /* these do not guarantee ASCII characters */
            return -1;
        }

        switch (c) {
        case '\\': case '\'': case '\"':
        case 'a': case 'b': case 'f':
        case 'n': case 'r': case 't': case 'v':
            count += 1;
            break;
        default:
            /* unknown escape: backslash and character are both kept */
            count += 2;
            break;
        }
    }
    return count;
}
5520
5521 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5522
5523 PyObject *
5524 PyUnicode_DecodeUnicodeEscape(const char *s,
5525 Py_ssize_t size,
5526 const char *errors)
5527 {
5528 const char *starts = s;
5529 Py_ssize_t startinpos;
5530 Py_ssize_t endinpos;
5531 int j;
5532 PyObject *v;
5533 const char *end;
5534 char* message;
5535 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5536 PyObject *errorHandler = NULL;
5537 PyObject *exc = NULL;
5538 Py_ssize_t len;
5539 Py_ssize_t i;
5540
5541 len = length_of_escaped_ascii_string(s, size);
5542
5543 /* After length_of_escaped_ascii_string() there are two alternatives,
5544 either the string is pure ASCII with named escapes like \n, etc.
5545 and we determined it's exact size (common case)
5546 or it contains \x, \u, ... escape sequences. then we create a
5547 legacy wchar string and resize it at the end of this function. */
5548 if (len >= 0) {
5549 v = PyUnicode_New(len, 127);
5550 if (!v)
5551 goto onError;
5552 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5553 }
5554 else {
5555 /* Escaped strings will always be longer than the resulting
5556 Unicode string, so we start with size here and then reduce the
5557 length after conversion to the true value.
5558 (but if the error callback returns a long replacement string
5559 we'll have to allocate more space) */
5560 v = PyUnicode_New(size, 127);
5561 if (!v)
5562 goto onError;
5563 len = size;
5564 }
5565
5566 if (size == 0)
5567 return v;
5568 i = 0;
5569 end = s + size;
5570
5571 while (s < end) {
5572 unsigned char c;
5573 Py_UCS4 x;
5574 int digits;
5575
5576 /* The only case in which i == ascii_length is a backslash
5577 followed by a newline. */
5578 assert(i <= len);
5579
5580 /* Non-escape characters are interpreted as Unicode ordinals */
5581 if (*s != '\\') {
5582 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5583 goto onError;
5584 continue;
5585 }
5586
5587 startinpos = s-starts;
5588 /* \ - Escapes */
5589 s++;
5590 c = *s++;
5591 if (s > end)
5592 c = '\0'; /* Invalid after \ */
5593
5594 /* The only case in which i == ascii_length is a backslash
5595 followed by a newline. */
5596 assert(i < len || (i == len && c == '\n'));
5597
5598 switch (c) {
5599
5600 /* \x escapes */
5601 #define WRITECHAR(ch) \
5602 do { \
5603 if (unicode_putchar(&v, &i, ch) < 0) \
5604 goto onError; \
5605 }while(0)
5606
5607 case '\n': break;
5608 case '\\': WRITECHAR('\\'); break;
5609 case '\'': WRITECHAR('\''); break;
5610 case '\"': WRITECHAR('\"'); break;
5611 case 'b': WRITECHAR('\b'); break;
5612 /* FF */
5613 case 'f': WRITECHAR('\014'); break;
5614 case 't': WRITECHAR('\t'); break;
5615 case 'n': WRITECHAR('\n'); break;
5616 case 'r': WRITECHAR('\r'); break;
5617 /* VT */
5618 case 'v': WRITECHAR('\013'); break;
5619 /* BEL, not classic C */
5620 case 'a': WRITECHAR('\007'); break;
5621
5622 /* \OOO (octal) escapes */
5623 case '0': case '1': case '2': case '3':
5624 case '4': case '5': case '6': case '7':
5625 x = s[-1] - '0';
5626 if (s < end && '0' <= *s && *s <= '7') {
5627 x = (x<<3) + *s++ - '0';
5628 if (s < end && '0' <= *s && *s <= '7')
5629 x = (x<<3) + *s++ - '0';
5630 }
5631 WRITECHAR(x);
5632 break;
5633
5634 /* hex escapes */
5635 /* \xXX */
5636 case 'x':
5637 digits = 2;
5638 message = "truncated \\xXX escape";
5639 goto hexescape;
5640
5641 /* \uXXXX */
5642 case 'u':
5643 digits = 4;
5644 message = "truncated \\uXXXX escape";
5645 goto hexescape;
5646
5647 /* \UXXXXXXXX */
5648 case 'U':
5649 digits = 8;
5650 message = "truncated \\UXXXXXXXX escape";
5651 hexescape:
5652 chr = 0;
5653 if (s+digits>end) {
5654 endinpos = size;
5655 if (unicode_decode_call_errorhandler(
5656 errors, &errorHandler,
5657 "unicodeescape", "end of string in escape sequence",
5658 &starts, &end, &startinpos, &endinpos, &exc, &s,
5659 &v, &i))
5660 goto onError;
5661 goto nextByte;
5662 }
5663 for (j = 0; j < digits; ++j) {
5664 c = (unsigned char) s[j];
5665 if (!Py_ISXDIGIT(c)) {
5666 endinpos = (s+j+1)-starts;
5667 if (unicode_decode_call_errorhandler(
5668 errors, &errorHandler,
5669 "unicodeescape", message,
5670 &starts, &end, &startinpos, &endinpos, &exc, &s,
5671 &v, &i))
5672 goto onError;
5673 len = PyUnicode_GET_LENGTH(v);
5674 goto nextByte;
5675 }
5676 chr = (chr<<4) & ~0xF;
5677 if (c >= '0' && c <= '9')
5678 chr += c - '0';
5679 else if (c >= 'a' && c <= 'f')
5680 chr += 10 + c - 'a';
5681 else
5682 chr += 10 + c - 'A';
5683 }
5684 s += j;
5685 if (chr == 0xffffffff && PyErr_Occurred())
5686 /* _decoding_error will have already written into the
5687 target buffer. */
5688 break;
5689 store:
5690 /* when we get here, chr is a 32-bit unicode character */
5691 if (chr <= MAX_UNICODE) {
5692 WRITECHAR(chr);
5693 } else {
5694 endinpos = s-starts;
5695 if (unicode_decode_call_errorhandler(
5696 errors, &errorHandler,
5697 "unicodeescape", "illegal Unicode character",
5698 &starts, &end, &startinpos, &endinpos, &exc, &s,
5699 &v, &i))
5700 goto onError;
5701 }
5702 break;
5703
5704 /* \N{name} */
5705 case 'N':
5706 message = "malformed \\N character escape";
5707 if (ucnhash_CAPI == NULL) {
5708 /* load the unicode data module */
5709 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5710 PyUnicodeData_CAPSULE_NAME, 1);
5711 if (ucnhash_CAPI == NULL)
5712 goto ucnhashError;
5713 }
5714 if (*s == '{') {
5715 const char *start = s+1;
5716 /* look for the closing brace */
5717 while (*s != '}' && s < end)
5718 s++;
5719 if (s > start && s < end && *s == '}') {
5720 /* found a name. look it up in the unicode database */
5721 message = "unknown Unicode character name";
5722 s++;
5723 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5724 &chr, 0))
5725 goto store;
5726 }
5727 }
5728 endinpos = s-starts;
5729 if (unicode_decode_call_errorhandler(
5730 errors, &errorHandler,
5731 "unicodeescape", message,
5732 &starts, &end, &startinpos, &endinpos, &exc, &s,
5733 &v, &i))
5734 goto onError;
5735 break;
5736
5737 default:
5738 if (s > end) {
5739 message = "\\ at end of string";
5740 s--;
5741 endinpos = s-starts;
5742 if (unicode_decode_call_errorhandler(
5743 errors, &errorHandler,
5744 "unicodeescape", message,
5745 &starts, &end, &startinpos, &endinpos, &exc, &s,
5746 &v, &i))
5747 goto onError;
5748 }
5749 else {
5750 WRITECHAR('\\');
5751 WRITECHAR(s[-1]);
5752 }
5753 break;
5754 }
5755 nextByte:
5756 ;
5757 }
5758 #undef WRITECHAR
5759
5760 if (PyUnicode_Resize(&v, i) < 0)
5761 goto onError;
5762 Py_XDECREF(errorHandler);
5763 Py_XDECREF(exc);
5764 return unicode_result(v);
5765
5766 ucnhashError:
5767 PyErr_SetString(
5768 PyExc_UnicodeError,
5769 "\\N escapes not supported (can't load unicodedata module)"
5770 );
5771 Py_XDECREF(v);
5772 Py_XDECREF(errorHandler);
5773 Py_XDECREF(exc);
5774 return NULL;
5775
5776 onError:
5777 Py_XDECREF(v);
5778 Py_XDECREF(errorHandler);
5779 Py_XDECREF(exc);
5780 return NULL;
5781 }
5782
5783 /* Return a Unicode-Escape string version of the Unicode object.
5784
5785 If quotes is true, the string is enclosed in u"" or u'' quotes as
5786 appropriate.
5787
5788 */
5789
5790 PyObject *
5791 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5792 {
5793 Py_ssize_t i, len;
5794 PyObject *repr;
5795 char *p;
5796 int kind;
5797 void *data;
5798 Py_ssize_t expandsize = 0;
5799
5800 /* Initial allocation is based on the longest-possible unichr
5801 escape.
5802
5803 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5804 unichr, so in this case it's the longest unichr escape. In
5805 narrow (UTF-16) builds this is five chars per source unichr
5806 since there are two unichrs in the surrogate pair, so in narrow
5807 (UTF-16) builds it's not the longest unichr escape.
5808
5809 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5810 so in the narrow (UTF-16) build case it's the longest unichr
5811 escape.
5812 */
5813
5814 if (!PyUnicode_Check(unicode)) {
5815 PyErr_BadArgument();
5816 return NULL;
5817 }
5818 if (PyUnicode_READY(unicode) < 0)
5819 return NULL;
5820 len = PyUnicode_GET_LENGTH(unicode);
5821 kind = PyUnicode_KIND(unicode);
5822 data = PyUnicode_DATA(unicode);
5823 switch(kind) {
5824 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5825 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5826 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5827 }
5828
5829 if (len == 0)
5830 return PyBytes_FromStringAndSize(NULL, 0);
5831
5832 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5833 return PyErr_NoMemory();
5834
5835 repr = PyBytes_FromStringAndSize(NULL,
5836 2
5837 + expandsize*len
5838 + 1);
5839 if (repr == NULL)
5840 return NULL;
5841
5842 p = PyBytes_AS_STRING(repr);
5843
5844 for (i = 0; i < len; i++) {
5845 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5846
5847 /* Escape backslashes */
5848 if (ch == '\\') {
5849 *p++ = '\\';
5850 *p++ = (char) ch;
5851 continue;
5852 }
5853
5854 /* Map 21-bit characters to '\U00xxxxxx' */
5855 else if (ch >= 0x10000) {
5856 assert(ch <= MAX_UNICODE);
5857 *p++ = '\\';
5858 *p++ = 'U';
5859 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5860 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5861 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5862 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5863 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5864 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5865 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5866 *p++ = Py_hexdigits[ch & 0x0000000F];
5867 continue;
5868 }
5869
5870 /* Map 16-bit characters to '\uxxxx' */
5871 if (ch >= 256) {
5872 *p++ = '\\';
5873 *p++ = 'u';
5874 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5875 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5876 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5877 *p++ = Py_hexdigits[ch & 0x000F];
5878 }
5879
5880 /* Map special whitespace to '\t', \n', '\r' */
5881 else if (ch == '\t') {
5882 *p++ = '\\';
5883 *p++ = 't';
5884 }
5885 else if (ch == '\n') {
5886 *p++ = '\\';
5887 *p++ = 'n';
5888 }
5889 else if (ch == '\r') {
5890 *p++ = '\\';
5891 *p++ = 'r';
5892 }
5893
5894 /* Map non-printable US ASCII to '\xhh' */
5895 else if (ch < ' ' || ch >= 0x7F) {
5896 *p++ = '\\';
5897 *p++ = 'x';
5898 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5899 *p++ = Py_hexdigits[ch & 0x000F];
5900 }
5901
5902 /* Copy everything else as-is */
5903 else
5904 *p++ = (char) ch;
5905 }
5906
5907 assert(p - PyBytes_AS_STRING(repr) > 0);
5908 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5909 return NULL;
5910 return repr;
5911 }
5912
5913 PyObject *
5914 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5915 Py_ssize_t size)
5916 {
5917 PyObject *result;
5918 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5919 if (tmp == NULL)
5920 return NULL;
5921 result = PyUnicode_AsUnicodeEscapeString(tmp);
5922 Py_DECREF(tmp);
5923 return result;
5924 }
5925
5926 /* --- Raw Unicode Escape Codec ------------------------------------------- */
5927
5928 PyObject *
5929 PyUnicode_DecodeRawUnicodeEscape(const char *s,
5930 Py_ssize_t size,
5931 const char *errors)
5932 {
5933 const char *starts = s;
5934 Py_ssize_t startinpos;
5935 Py_ssize_t endinpos;
5936 Py_ssize_t outpos;
5937 PyObject *v;
5938 const char *end;
5939 const char *bs;
5940 PyObject *errorHandler = NULL;
5941 PyObject *exc = NULL;
5942
5943 /* Escaped strings will always be longer than the resulting
5944 Unicode string, so we start with size here and then reduce the
5945 length after conversion to the true value. (But decoding error
5946 handler might have to resize the string) */
5947 v = PyUnicode_New(size, 127);
5948 if (v == NULL)
5949 goto onError;
5950 if (size == 0)
5951 return v;
5952 outpos = 0;
5953 end = s + size;
5954 while (s < end) {
5955 unsigned char c;
5956 Py_UCS4 x;
5957 int i;
5958 int count;
5959
5960 /* Non-escape characters are interpreted as Unicode ordinals */
5961 if (*s != '\\') {
5962 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5963 goto onError;
5964 continue;
5965 }
5966 startinpos = s-starts;
5967
5968 /* \u-escapes are only interpreted iff the number of leading
5969 backslashes if odd */
5970 bs = s;
5971 for (;s < end;) {
5972 if (*s != '\\')
5973 break;
5974 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5975 goto onError;
5976 }
5977 if (((s - bs) & 1) == 0 ||
5978 s >= end ||
5979 (*s != 'u' && *s != 'U')) {
5980 continue;
5981 }
5982 outpos--;
5983 count = *s=='u' ? 4 : 8;
5984 s++;
5985
5986 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5987 for (x = 0, i = 0; i < count; ++i, ++s) {
5988 c = (unsigned char)*s;
5989 if (!Py_ISXDIGIT(c)) {
5990 endinpos = s-starts;
5991 if (unicode_decode_call_errorhandler(
5992 errors, &errorHandler,
5993 "rawunicodeescape", "truncated \\uXXXX",
5994 &starts, &end, &startinpos, &endinpos, &exc, &s,
5995 &v, &outpos))
5996 goto onError;
5997 goto nextByte;
5998 }
5999 x = (x<<4) & ~0xF;
6000 if (c >= '0' && c <= '9')
6001 x += c - '0';
6002 else if (c >= 'a' && c <= 'f')
6003 x += 10 + c - 'a';
6004 else
6005 x += 10 + c - 'A';
6006 }
6007 if (x <= MAX_UNICODE) {
6008 if (unicode_putchar(&v, &outpos, x) < 0)
6009 goto onError;
6010 } else {
6011 endinpos = s-starts;
6012 if (unicode_decode_call_errorhandler(
6013 errors, &errorHandler,
6014 "rawunicodeescape", "\\Uxxxxxxxx out of range",
6015 &starts, &end, &startinpos, &endinpos, &exc, &s,
6016 &v, &outpos))
6017 goto onError;
6018 }
6019 nextByte:
6020 ;
6021 }
6022 if (PyUnicode_Resize(&v, outpos) < 0)
6023 goto onError;
6024 Py_XDECREF(errorHandler);
6025 Py_XDECREF(exc);
6026 return unicode_result(v);
6027
6028 onError:
6029 Py_XDECREF(v);
6030 Py_XDECREF(errorHandler);
6031 Py_XDECREF(exc);
6032 return NULL;
6033 }
6034
6035
6036 PyObject *
6037 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6038 {
6039 PyObject *repr;
6040 char *p;
6041 char *q;
6042 Py_ssize_t expandsize, pos;
6043 int kind;
6044 void *data;
6045 Py_ssize_t len;
6046
6047 if (!PyUnicode_Check(unicode)) {
6048 PyErr_BadArgument();
6049 return NULL;
6050 }
6051 if (PyUnicode_READY(unicode) < 0)
6052 return NULL;
6053 kind = PyUnicode_KIND(unicode);
6054 data = PyUnicode_DATA(unicode);
6055 len = PyUnicode_GET_LENGTH(unicode);
6056 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6057 bytes, and 1 byte characters 4. */
6058 expandsize = kind * 2 + 2;
6059
6060 if (len > PY_SSIZE_T_MAX / expandsize)
6061 return PyErr_NoMemory();
6062
6063 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6064 if (repr == NULL)
6065 return NULL;
6066 if (len == 0)
6067 return repr;
6068
6069 p = q = PyBytes_AS_STRING(repr);
6070 for (pos = 0; pos < len; pos++) {
6071 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6072 /* Map 32-bit characters to '\Uxxxxxxxx' */
6073 if (ch >= 0x10000) {
6074 assert(ch <= MAX_UNICODE);
6075 *p++ = '\\';
6076 *p++ = 'U';
6077 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6078 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6079 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6080 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6081 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6082 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6083 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6084 *p++ = Py_hexdigits[ch & 15];
6085 }
6086 /* Map 16-bit characters to '\uxxxx' */
6087 else if (ch >= 256) {
6088 *p++ = '\\';
6089 *p++ = 'u';
6090 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6091 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6092 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6093 *p++ = Py_hexdigits[ch & 15];
6094 }
6095 /* Copy everything else as-is */
6096 else
6097 *p++ = (char) ch;
6098 }
6099
6100 assert(p > q);
6101 if (_PyBytes_Resize(&repr, p - q) < 0)
6102 return NULL;
6103 return repr;
6104 }
6105
6106 PyObject *
6107 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6108 Py_ssize_t size)
6109 {
6110 PyObject *result;
6111 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6112 if (tmp == NULL)
6113 return NULL;
6114 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6115 Py_DECREF(tmp);
6116 return result;
6117 }
6118
6119 /* --- Unicode Internal Codec ------------------------------------------- */
6120
6121 PyObject *
6122 _PyUnicode_DecodeUnicodeInternal(const char *s,
6123 Py_ssize_t size,
6124 const char *errors)
6125 {
6126 const char *starts = s;
6127 Py_ssize_t startinpos;
6128 Py_ssize_t endinpos;
6129 Py_ssize_t outpos;
6130 PyObject *v;
6131 const char *end;
6132 const char *reason;
6133 PyObject *errorHandler = NULL;
6134 PyObject *exc = NULL;
6135
6136 if (PyErr_WarnEx(PyExc_DeprecationWarning,
6137 "unicode_internal codec has been deprecated",
6138 1))
6139 return NULL;
6140
6141 /* XXX overflow detection missing */
6142 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
6143 if (v == NULL)
6144 goto onError;
6145 if (PyUnicode_GET_LENGTH(v) == 0)
6146 return v;
6147 outpos = 0;
6148 end = s + size;
6149
6150 while (s < end) {
6151 Py_UNICODE uch;
6152 Py_UCS4 ch;
6153 /* We copy the raw representation one byte at a time because the
6154 pointer may be unaligned (see test_codeccallbacks). */
6155 ((char *) &uch)[0] = s[0];
6156 ((char *) &uch)[1] = s[1];
6157 #ifdef Py_UNICODE_WIDE
6158 ((char *) &uch)[2] = s[2];
6159 ((char *) &uch)[3] = s[3];
6160 #endif
6161 ch = uch;
6162
6163 /* We have to sanity check the raw data, otherwise doom looms for
6164 some malformed UCS-4 data. */
6165 if (
6166 #ifdef Py_UNICODE_WIDE
6167 ch > 0x10ffff ||
6168 #endif
6169 end-s < Py_UNICODE_SIZE
6170 )
6171 {
6172 startinpos = s - starts;
6173 if (end-s < Py_UNICODE_SIZE) {
6174 endinpos = end-starts;
6175 reason = "truncated input";
6176 }
6177 else {
6178 endinpos = s - starts + Py_UNICODE_SIZE;
6179 reason = "illegal code point (> 0x10FFFF)";
6180 }
6181 if (unicode_decode_call_errorhandler(
6182 errors, &errorHandler,
6183 "unicode_internal", reason,
6184 &starts, &end, &startinpos, &endinpos, &exc, &s,
6185 &v, &outpos))
6186 goto onError;
6187 continue;
6188 }
6189
6190 s += Py_UNICODE_SIZE;
6191 #ifndef Py_UNICODE_WIDE
6192 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
6193 {
6194 Py_UNICODE uch2;
6195 ((char *) &uch2)[0] = s[0];
6196 ((char *) &uch2)[1] = s[1];
6197 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6198 {
6199 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6200 s += Py_UNICODE_SIZE;
6201 }
6202 }
6203 #endif
6204
6205 if (unicode_putchar(&v, &outpos, ch) < 0)
6206 goto onError;
6207 }
6208
6209 if (PyUnicode_Resize(&v, outpos) < 0)
6210 goto onError;
6211 Py_XDECREF(errorHandler);
6212 Py_XDECREF(exc);
6213 return unicode_result(v);
6214
6215 onError:
6216 Py_XDECREF(v);
6217 Py_XDECREF(errorHandler);
6218 Py_XDECREF(exc);
6219 return NULL;
6220 }
6221
6222 /* --- Latin-1 Codec ------------------------------------------------------ */
6223
6224 PyObject *
6225 PyUnicode_DecodeLatin1(const char *s,
6226 Py_ssize_t size,
6227 const char *errors)
6228 {
6229 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6230 return _PyUnicode_FromUCS1((unsigned char*)s, size);
6231 }
6232
6233 /* create or adjust a UnicodeEncodeError */
6234 static void
6235 make_encode_exception(PyObject **exceptionObject,
6236 const char *encoding,
6237 PyObject *unicode,
6238 Py_ssize_t startpos, Py_ssize_t endpos,
6239 const char *reason)
6240 {
6241 if (*exceptionObject == NULL) {
6242 *exceptionObject = PyObject_CallFunction(
6243 PyExc_UnicodeEncodeError, "sOnns",
6244 encoding, unicode, startpos, endpos, reason);
6245 }
6246 else {
6247 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6248 goto onError;
6249 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6250 goto onError;
6251 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6252 goto onError;
6253 return;
6254 onError:
6255 Py_DECREF(*exceptionObject);
6256 *exceptionObject = NULL;
6257 }
6258 }
6259
6260 /* raises a UnicodeEncodeError */
6261 static void
6262 raise_encode_exception(PyObject **exceptionObject,
6263 const char *encoding,
6264 PyObject *unicode,
6265 Py_ssize_t startpos, Py_ssize_t endpos,
6266 const char *reason)
6267 {
6268 make_encode_exception(exceptionObject,
6269 encoding, unicode, startpos, endpos, reason);
6270 if (*exceptionObject != NULL)
6271 PyCodec_StrictErrors(*exceptionObject);
6272 }
6273
6274 /* error handling callback helper:
6275 build arguments, call the callback and check the arguments,
6276 put the result into newpos and return the replacement string, which
6277 has to be freed by the caller */
6278 static PyObject *
6279 unicode_encode_call_errorhandler(const char *errors,
6280 PyObject **errorHandler,
6281 const char *encoding, const char *reason,
6282 PyObject *unicode, PyObject **exceptionObject,
6283 Py_ssize_t startpos, Py_ssize_t endpos,
6284 Py_ssize_t *newpos)
6285 {
6286 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6287 Py_ssize_t len;
6288 PyObject *restuple;
6289 PyObject *resunicode;
6290
6291 if (*errorHandler == NULL) {
6292 *errorHandler = PyCodec_LookupError(errors);
6293 if (*errorHandler == NULL)
6294 return NULL;
6295 }
6296
6297 if (PyUnicode_READY(unicode) < 0)
6298 return NULL;
6299 len = PyUnicode_GET_LENGTH(unicode);
6300
6301 make_encode_exception(exceptionObject,
6302 encoding, unicode, startpos, endpos, reason);
6303 if (*exceptionObject == NULL)
6304 return NULL;
6305
6306 restuple = PyObject_CallFunctionObjArgs(
6307 *errorHandler, *exceptionObject, NULL);
6308 if (restuple == NULL)
6309 return NULL;
6310 if (!PyTuple_Check(restuple)) {
6311 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6312 Py_DECREF(restuple);
6313 return NULL;
6314 }
6315 if (!PyArg_ParseTuple(restuple, argparse,
6316 &resunicode, newpos)) {
6317 Py_DECREF(restuple);
6318 return NULL;
6319 }
6320 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6321 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6322 Py_DECREF(restuple);
6323 return NULL;
6324 }
6325 if (*newpos<0)
6326 *newpos = len + *newpos;
6327 if (*newpos<0 || *newpos>len) {
6328 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6329 Py_DECREF(restuple);
6330 return NULL;
6331 }
6332 Py_INCREF(resunicode);
6333 Py_DECREF(restuple);
6334 return resunicode;
6335 }
6336
6337 static PyObject *
6338 unicode_encode_ucs1(PyObject *unicode,
6339 const char *errors,
6340 unsigned int limit)
6341 {
6342 /* input state */
6343 Py_ssize_t pos=0, size;
6344 int kind;
6345 void *data;
6346 /* output object */
6347 PyObject *res;
6348 /* pointer into the output */
6349 char *str;
6350 /* current output position */
6351 Py_ssize_t ressize;
6352 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6353 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6354 PyObject *errorHandler = NULL;
6355 PyObject *exc = NULL;
6356 /* the following variable is used for caching string comparisons
6357 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6358 int known_errorHandler = -1;
6359
6360 if (PyUnicode_READY(unicode) < 0)
6361 return NULL;
6362 size = PyUnicode_GET_LENGTH(unicode);
6363 kind = PyUnicode_KIND(unicode);
6364 data = PyUnicode_DATA(unicode);
6365 /* allocate enough for a simple encoding without
6366 replacements, if we need more, we'll resize */
6367 if (size == 0)
6368 return PyBytes_FromStringAndSize(NULL, 0);
6369 res = PyBytes_FromStringAndSize(NULL, size);
6370 if (res == NULL)
6371 return NULL;
6372 str = PyBytes_AS_STRING(res);
6373 ressize = size;
6374
6375 while (pos < size) {
6376 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6377
6378 /* can we encode this? */
6379 if (c<limit) {
6380 /* no overflow check, because we know that the space is enough */
6381 *str++ = (char)c;
6382 ++pos;
6383 }
6384 else {
6385 Py_ssize_t requiredsize;
6386 PyObject *repunicode;
6387 Py_ssize_t repsize, newpos, respos, i;
6388 /* startpos for collecting unencodable chars */
6389 Py_ssize_t collstart = pos;
6390 Py_ssize_t collend = pos;
6391 /* find all unecodable characters */
6392 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
6393 ++collend;
6394 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6395 if (known_errorHandler==-1) {
6396 if ((errors==NULL) || (!strcmp(errors, "strict")))
6397 known_errorHandler = 1;
6398 else if (!strcmp(errors, "replace"))
6399 known_errorHandler = 2;
6400 else if (!strcmp(errors, "ignore"))
6401 known_errorHandler = 3;
6402 else if (!strcmp(errors, "xmlcharrefreplace"))
6403 known_errorHandler = 4;
6404 else
6405 known_errorHandler = 0;
6406 }
6407 switch (known_errorHandler) {
6408 case 1: /* strict */
6409 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6410 goto onError;
6411 case 2: /* replace */
6412 while (collstart++<collend)
6413 *str++ = '?'; /* fall through */
6414 case 3: /* ignore */
6415 pos = collend;
6416 break;
6417 case 4: /* xmlcharrefreplace */
6418 respos = str - PyBytes_AS_STRING(res);
6419 /* determine replacement size */
6420 for (i = collstart, repsize = 0; i < collend; ++i) {
6421 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6422 if (ch < 10)
6423 repsize += 2+1+1;
6424 else if (ch < 100)
6425 repsize += 2+2+1;
6426 else if (ch < 1000)
6427 repsize += 2+3+1;
6428 else if (ch < 10000)
6429 repsize += 2+4+1;
6430 else if (ch < 100000)
6431 repsize += 2+5+1;
6432 else if (ch < 1000000)
6433 repsize += 2+6+1;
6434 else {
6435 assert(ch <= MAX_UNICODE);
6436 repsize += 2+7+1;
6437 }
6438 }
6439 requiredsize = respos+repsize+(size-collend);
6440 if (requiredsize > ressize) {
6441 if (requiredsize<2*ressize)
6442 requiredsize = 2*ressize;
6443 if (_PyBytes_Resize(&res, requiredsize))
6444 goto onError;
6445 str = PyBytes_AS_STRING(res) + respos;
6446 ressize = requiredsize;
6447 }
6448 /* generate replacement */
6449 for (i = collstart; i < collend; ++i) {
6450 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6451 }
6452 pos = collend;
6453 break;
6454 default:
6455 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6456 encoding, reason, unicode, &exc,
6457 collstart, collend, &newpos);
6458 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6459 PyUnicode_READY(repunicode) < 0))
6460 goto onError;
6461 if (PyBytes_Check(repunicode)) {
6462 /* Directly copy bytes result to output. */
6463 repsize = PyBytes_Size(repunicode);
6464 if (repsize > 1) {
6465 /* Make room for all additional bytes. */
6466 respos = str - PyBytes_AS_STRING(res);
6467 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6468 Py_DECREF(repunicode);
6469 goto onError;
6470 }
6471 str = PyBytes_AS_STRING(res) + respos;
6472 ressize += repsize-1;
6473 }
6474 memcpy(str, PyBytes_AsString(repunicode), repsize);
6475 str += repsize;
6476 pos = newpos;
6477 Py_DECREF(repunicode);
6478 break;
6479 }
6480 /* need more space? (at least enough for what we
6481 have+the replacement+the rest of the string, so
6482 we won't have to check space for encodable characters) */
6483 respos = str - PyBytes_AS_STRING(res);
6484 repsize = PyUnicode_GET_LENGTH(repunicode);
6485 requiredsize = respos+repsize+(size-collend);
6486 if (requiredsize > ressize) {
6487 if (requiredsize<2*ressize)
6488 requiredsize = 2*ressize;
6489 if (_PyBytes_Resize(&res, requiredsize)) {
6490 Py_DECREF(repunicode);
6491 goto onError;
6492 }
6493 str = PyBytes_AS_STRING(res) + respos;
6494 ressize = requiredsize;
6495 }
6496 /* check if there is anything unencodable in the replacement
6497 and copy it to the output */
6498 for (i = 0; repsize-->0; ++i, ++str) {
6499 c = PyUnicode_READ_CHAR(repunicode, i);
6500 if (c >= limit) {
6501 raise_encode_exception(&exc, encoding, unicode,
6502 pos, pos+1, reason);
6503 Py_DECREF(repunicode);
6504 goto onError;
6505 }
6506 *str = (char)c;
6507 }
6508 pos = newpos;
6509 Py_DECREF(repunicode);
6510 }
6511 }
6512 }
6513 /* Resize if we allocated to much */
6514 size = str - PyBytes_AS_STRING(res);
6515 if (size < ressize) { /* If this falls res will be NULL */
6516 assert(size >= 0);
6517 if (_PyBytes_Resize(&res, size) < 0)
6518 goto onError;
6519 }
6520
6521 Py_XDECREF(errorHandler);
6522 Py_XDECREF(exc);
6523 return res;
6524
6525 onError:
6526 Py_XDECREF(res);
6527 Py_XDECREF(errorHandler);
6528 Py_XDECREF(exc);
6529 return NULL;
6530 }
6531
6532 /* Deprecated */
6533 PyObject *
6534 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6535 Py_ssize_t size,
6536 const char *errors)
6537 {
6538 PyObject *result;
6539 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6540 if (unicode == NULL)
6541 return NULL;
6542 result = unicode_encode_ucs1(unicode, errors, 256);
6543 Py_DECREF(unicode);
6544 return result;
6545 }
6546
6547 PyObject *
6548 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6549 {
6550 if (!PyUnicode_Check(unicode)) {
6551 PyErr_BadArgument();
6552 return NULL;
6553 }
6554 if (PyUnicode_READY(unicode) == -1)
6555 return NULL;
6556 /* Fast path: if it is a one-byte string, construct
6557 bytes object directly. */
6558 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6559 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6560 PyUnicode_GET_LENGTH(unicode));
6561 /* Non-Latin-1 characters present. Defer to above function to
6562 raise the exception. */
6563 return unicode_encode_ucs1(unicode, errors, 256);
6564 }
6565
6566 PyObject*
6567 PyUnicode_AsLatin1String(PyObject *unicode)
6568 {
6569 return _PyUnicode_AsLatin1String(unicode, NULL);
6570 }
6571
6572 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6573
6574 PyObject *
6575 PyUnicode_DecodeASCII(const char *s,
6576 Py_ssize_t size,
6577 const char *errors)
6578 {
6579 const char *starts = s;
6580 PyObject *v;
6581 int kind;
6582 void *data;
6583 Py_ssize_t startinpos;
6584 Py_ssize_t endinpos;
6585 Py_ssize_t outpos;
6586 const char *e;
6587 int has_error;
6588 const unsigned char *p = (const unsigned char *)s;
6589 const unsigned char *end = p + size;
6590 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
6591 PyObject *errorHandler = NULL;
6592 PyObject *exc = NULL;
6593
6594 if (size == 0) {
6595 Py_INCREF(unicode_empty);
6596 return unicode_empty;
6597 }
6598
6599 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6600 if (size == 1 && (unsigned char)s[0] < 128)
6601 return get_latin1_char((unsigned char)s[0]);
6602
6603 has_error = 0;
6604 while (p < end && !has_error) {
6605 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6606 an explanation. */
6607 if (!((size_t) p & LONG_PTR_MASK)) {
6608 /* Help register allocation */
6609 register const unsigned char *_p = p;
6610 while (_p < aligned_end) {
6611 unsigned long value = *(unsigned long *) _p;
6612 if (value & ASCII_CHAR_MASK) {
6613 has_error = 1;
6614 break;
6615 }
6616 _p += SIZEOF_LONG;
6617 }
6618 if (_p == end)
6619 break;
6620 if (has_error)
6621 break;
6622 p = _p;
6623 }
6624 if (*p & 0x80) {
6625 has_error = 1;
6626 break;
6627 }
6628 else {
6629 ++p;
6630 }
6631 }
6632 if (!has_error)
6633 return unicode_fromascii((const unsigned char *)s, size);
6634
6635 v = PyUnicode_New(size, 127);
6636 if (v == NULL)
6637 goto onError;
6638 if (size == 0)
6639 return v;
6640 kind = PyUnicode_KIND(v);
6641 data = PyUnicode_DATA(v);
6642 outpos = 0;
6643 e = s + size;
6644 while (s < e) {
6645 register unsigned char c = (unsigned char)*s;
6646 if (c < 128) {
6647 PyUnicode_WRITE(kind, data, outpos++, c);
6648 ++s;
6649 }
6650 else {
6651 startinpos = s-starts;
6652 endinpos = startinpos + 1;
6653 if (unicode_decode_call_errorhandler(
6654 errors, &errorHandler,
6655 "ascii", "ordinal not in range(128)",
6656 &starts, &e, &startinpos, &endinpos, &exc, &s,
6657 &v, &outpos))
6658 goto onError;
6659 kind = PyUnicode_KIND(v);
6660 data = PyUnicode_DATA(v);
6661 }
6662 }
6663 if (PyUnicode_Resize(&v, outpos) < 0)
6664 goto onError;
6665 Py_XDECREF(errorHandler);
6666 Py_XDECREF(exc);
6667 assert(_PyUnicode_CheckConsistency(v, 1));
6668 return v;
6669
6670 onError:
6671 Py_XDECREF(v);
6672 Py_XDECREF(errorHandler);
6673 Py_XDECREF(exc);
6674 return NULL;
6675 }
6676
6677 /* Deprecated */
6678 PyObject *
6679 PyUnicode_EncodeASCII(const Py_UNICODE *p,
6680 Py_ssize_t size,
6681 const char *errors)
6682 {
6683 PyObject *result;
6684 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6685 if (unicode == NULL)
6686 return NULL;
6687 result = unicode_encode_ucs1(unicode, errors, 128);
6688 Py_DECREF(unicode);
6689 return result;
6690 }
6691
6692 PyObject *
6693 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6694 {
6695 if (!PyUnicode_Check(unicode)) {
6696 PyErr_BadArgument();
6697 return NULL;
6698 }
6699 if (PyUnicode_READY(unicode) == -1)
6700 return NULL;
6701 /* Fast path: if it is an ASCII-only string, construct bytes object
6702 directly. Else defer to above function to raise the exception. */
6703 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6704 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6705 PyUnicode_GET_LENGTH(unicode));
6706 return unicode_encode_ucs1(unicode, errors, 128);
6707 }
6708
6709 PyObject *
6710 PyUnicode_AsASCIIString(PyObject *unicode)
6711 {
6712 return _PyUnicode_AsASCIIString(unicode, NULL);
6713 }
6714
6715 #ifdef HAVE_MBCS
6716
6717 /* --- MBCS codecs for Windows -------------------------------------------- */
6718
6719 #if SIZEOF_INT < SIZEOF_SIZE_T
6720 #define NEED_RETRY
6721 #endif
6722
6723 #ifndef WC_ERR_INVALID_CHARS
6724 # define WC_ERR_INVALID_CHARS 0x0080
6725 #endif
6726
6727 static char*
6728 code_page_name(UINT code_page, PyObject **obj)
6729 {
6730 *obj = NULL;
6731 if (code_page == CP_ACP)
6732 return "mbcs";
6733 if (code_page == CP_UTF7)
6734 return "CP_UTF7";
6735 if (code_page == CP_UTF8)
6736 return "CP_UTF8";
6737
6738 *obj = PyBytes_FromFormat("cp%u", code_page);
6739 if (*obj == NULL)
6740 return NULL;
6741 return PyBytes_AS_STRING(*obj);
6742 }
6743
6744 static int
6745 is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6746 {
6747 const char *curr = s + offset;
6748 const char *prev;
6749
6750 if (!IsDBCSLeadByteEx(code_page, *curr))
6751 return 0;
6752
6753 prev = CharPrevExA(code_page, s, curr, 0);
6754 if (prev == curr)
6755 return 1;
6756 /* FIXME: This code is limited to "true" double-byte encodings,
6757 as it assumes an incomplete character consists of a single
6758 byte. */
6759 if (curr - prev == 2)
6760 return 1;
6761 if (!IsDBCSLeadByteEx(code_page, *prev))
6762 return 1;
6763 return 0;
6764 }
6765
6766 static DWORD
6767 decode_code_page_flags(UINT code_page)
6768 {
6769 if (code_page == CP_UTF7) {
6770 /* The CP_UTF7 decoder only supports flags=0 */
6771 return 0;
6772 }
6773 else
6774 return MB_ERR_INVALID_CHARS;
6775 }
6776
6777 /*
6778 * Decode a byte string from a Windows code page into unicode object in strict
6779 * mode.
6780 *
6781 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6782 * WindowsError and returns -1 on other error.
6783 */
6784 static int
6785 decode_code_page_strict(UINT code_page,
6786 PyObject **v,
6787 const char *in,
6788 int insize)
6789 {
6790 const DWORD flags = decode_code_page_flags(code_page);
6791 wchar_t *out;
6792 DWORD outsize;
6793
6794 /* First get the size of the result */
6795 assert(insize > 0);
6796 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6797 if (outsize <= 0)
6798 goto error;
6799
6800 if (*v == NULL) {
6801 /* Create unicode object */
6802 *v = (PyObject*)_PyUnicode_New(outsize);
6803 if (*v == NULL)
6804 return -1;
6805 out = PyUnicode_AS_UNICODE(*v);
6806 }
6807 else {
6808 /* Extend unicode object */
6809 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6810 if (PyUnicode_Resize(v, n + outsize) < 0)
6811 return -1;
6812 out = PyUnicode_AS_UNICODE(*v) + n;
6813 }
6814
6815 /* Do the conversion */
6816 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6817 if (outsize <= 0)
6818 goto error;
6819 return insize;
6820
6821 error:
6822 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6823 return -2;
6824 PyErr_SetFromWindowsErr(0);
6825 return -1;
6826 }
6827
6828 /*
6829 * Decode a byte string from a code page into unicode object with an error
6830 * handler.
6831 *
6832 * Returns consumed size if succeed, or raise a WindowsError or
6833 * UnicodeDecodeError exception and returns -1 on error.
6834 */
6835 static int
6836 decode_code_page_errors(UINT code_page,
6837 PyObject **v,
6838 const char *in, const int size,
6839 const char *errors)
6840 {
6841 const char *startin = in;
6842 const char *endin = in + size;
6843 const DWORD flags = decode_code_page_flags(code_page);
6844 /* Ideally, we should get reason from FormatMessage. This is the Windows
6845 2000 English version of the message. */
6846 const char *reason = "No mapping for the Unicode character exists "
6847 "in the target code page.";
6848 /* each step cannot decode more than 1 character, but a character can be
6849 represented as a surrogate pair */
6850 wchar_t buffer[2], *startout, *out;
6851 int insize, outsize;
6852 PyObject *errorHandler = NULL;
6853 PyObject *exc = NULL;
6854 PyObject *encoding_obj = NULL;
6855 char *encoding;
6856 DWORD err;
6857 int ret = -1;
6858
6859 assert(size > 0);
6860
6861 encoding = code_page_name(code_page, &encoding_obj);
6862 if (encoding == NULL)
6863 return -1;
6864
6865 if (errors == NULL || strcmp(errors, "strict") == 0) {
6866 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6867 UnicodeDecodeError. */
6868 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6869 if (exc != NULL) {
6870 PyCodec_StrictErrors(exc);
6871 Py_CLEAR(exc);
6872 }
6873 goto error;
6874 }
6875
6876 if (*v == NULL) {
6877 /* Create unicode object */
6878 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6879 PyErr_NoMemory();
6880 goto error;
6881 }
6882 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6883 if (*v == NULL)
6884 goto error;
6885 startout = PyUnicode_AS_UNICODE(*v);
6886 }
6887 else {
6888 /* Extend unicode object */
6889 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6890 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6891 PyErr_NoMemory();
6892 goto error;
6893 }
6894 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6895 goto error;
6896 startout = PyUnicode_AS_UNICODE(*v) + n;
6897 }
6898
6899 /* Decode the byte string character per character */
6900 out = startout;
6901 while (in < endin)
6902 {
6903 /* Decode a character */
6904 insize = 1;
6905 do
6906 {
6907 outsize = MultiByteToWideChar(code_page, flags,
6908 in, insize,
6909 buffer, Py_ARRAY_LENGTH(buffer));
6910 if (outsize > 0)
6911 break;
6912 err = GetLastError();
6913 if (err != ERROR_NO_UNICODE_TRANSLATION
6914 && err != ERROR_INSUFFICIENT_BUFFER)
6915 {
6916 PyErr_SetFromWindowsErr(0);
6917 goto error;
6918 }
6919 insize++;
6920 }
6921 /* 4=maximum length of a UTF-8 sequence */
6922 while (insize <= 4 && (in + insize) <= endin);
6923
6924 if (outsize <= 0) {
6925 Py_ssize_t startinpos, endinpos, outpos;
6926
6927 startinpos = in - startin;
6928 endinpos = startinpos + 1;
6929 outpos = out - PyUnicode_AS_UNICODE(*v);
6930 if (unicode_decode_call_errorhandler(
6931 errors, &errorHandler,
6932 encoding, reason,
6933 &startin, &endin, &startinpos, &endinpos, &exc, &in,
6934 v, &outpos))
6935 {
6936 goto error;
6937 }
6938 out = PyUnicode_AS_UNICODE(*v) + outpos;
6939 }
6940 else {
6941 in += insize;
6942 memcpy(out, buffer, outsize * sizeof(wchar_t));
6943 out += outsize;
6944 }
6945 }
6946
6947 /* write a NUL character at the end */
6948 *out = 0;
6949
6950 /* Extend unicode object */
6951 outsize = out - startout;
6952 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
6953 if (PyUnicode_Resize(v, outsize) < 0)
6954 goto error;
6955 ret = size;
6956
6957 error:
6958 Py_XDECREF(encoding_obj);
6959 Py_XDECREF(errorHandler);
6960 Py_XDECREF(exc);
6961 return ret;
6962 }
6963
6964 static PyObject *
6965 decode_code_page_stateful(int code_page,
6966 const char *s, Py_ssize_t size,
6967 const char *errors, Py_ssize_t *consumed)
6968 {
6969 PyObject *v = NULL;
6970 int chunk_size, final, converted, done;
6971
6972 if (code_page < 0) {
6973 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6974 return NULL;
6975 }
6976
6977 if (consumed)
6978 *consumed = 0;
6979
6980 do
6981 {
6982 #ifdef NEED_RETRY
6983 if (size > INT_MAX) {
6984 chunk_size = INT_MAX;
6985 final = 0;
6986 done = 0;
6987 }
6988 else
6989 #endif
6990 {
6991 chunk_size = (int)size;
6992 final = (consumed == NULL);
6993 done = 1;
6994 }
6995
6996 /* Skip trailing lead-byte unless 'final' is set */
6997 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6998 --chunk_size;
6999
7000 if (chunk_size == 0 && done) {
7001 if (v != NULL)
7002 break;
7003 Py_INCREF(unicode_empty);
7004 return unicode_empty;
7005 }
7006
7007
7008 converted = decode_code_page_strict(code_page, &v,
7009 s, chunk_size);
7010 if (converted == -2)
7011 converted = decode_code_page_errors(code_page, &v,
7012 s, chunk_size,
7013 errors);
7014 assert(converted != 0);
7015
7016 if (converted < 0) {
7017 Py_XDECREF(v);
7018 return NULL;
7019 }
7020
7021 if (consumed)
7022 *consumed += converted;
7023
7024 s += converted;
7025 size -= converted;
7026 } while (!done);
7027
7028 return unicode_result(v);
7029 }
7030
7031 PyObject *
7032 PyUnicode_DecodeCodePageStateful(int code_page,
7033 const char *s,
7034 Py_ssize_t size,
7035 const char *errors,
7036 Py_ssize_t *consumed)
7037 {
7038 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7039 }
7040
7041 PyObject *
7042 PyUnicode_DecodeMBCSStateful(const char *s,
7043 Py_ssize_t size,
7044 const char *errors,
7045 Py_ssize_t *consumed)
7046 {
7047 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7048 }
7049
7050 PyObject *
7051 PyUnicode_DecodeMBCS(const char *s,
7052 Py_ssize_t size,
7053 const char *errors)
7054 {
7055 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7056 }
7057
7058 static DWORD
7059 encode_code_page_flags(UINT code_page, const char *errors)
7060 {
7061 if (code_page == CP_UTF8) {
7062 if (winver.dwMajorVersion >= 6)
7063 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7064 and later */
7065 return WC_ERR_INVALID_CHARS;
7066 else
7067 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7068 return 0;
7069 }
7070 else if (code_page == CP_UTF7) {
7071 /* CP_UTF7 only supports flags=0 */
7072 return 0;
7073 }
7074 else {
7075 if (errors != NULL && strcmp(errors, "replace") == 0)
7076 return 0;
7077 else
7078 return WC_NO_BEST_FIT_CHARS;
7079 }
7080 }
7081
7082 /*
7083 * Encode a Unicode string to a Windows code page into a byte string in strict
7084 * mode.
7085 *
7086 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7087 * a WindowsError and returns -1 on other error.
7088 */
7089 static int
7090 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7091 PyObject *unicode, Py_ssize_t offset, int len,
7092 const char* errors)
7093 {
7094 BOOL usedDefaultChar = FALSE;
7095 BOOL *pusedDefaultChar = &usedDefaultChar;
7096 int outsize;
7097 PyObject *exc = NULL;
7098 wchar_t *p;
7099 Py_ssize_t size;
7100 const DWORD flags = encode_code_page_flags(code_page, NULL);
7101 char *out;
7102 /* Create a substring so that we can get the UTF-16 representation
7103 of just the slice under consideration. */
7104 PyObject *substring;
7105
7106 assert(len > 0);
7107
7108 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7109 pusedDefaultChar = &usedDefaultChar;
7110 else
7111 pusedDefaultChar = NULL;
7112
7113 substring = PyUnicode_Substring(unicode, offset, offset+len);
7114 if (substring == NULL)
7115 return -1;
7116 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7117 if (p == NULL) {
7118 Py_DECREF(substring);
7119 return -1;
7120 }
7121
7122 /* First get the size of the result */
7123 outsize = WideCharToMultiByte(code_page, flags,
7124 p, size,
7125 NULL, 0,
7126 NULL, pusedDefaultChar);
7127 if (outsize <= 0)
7128 goto error;
7129 /* If we used a default char, then we failed! */
7130 if (pusedDefaultChar && *pusedDefaultChar) {
7131 Py_DECREF(substring);
7132 return -2;
7133 }
7134
7135 if (*outbytes == NULL) {
7136 /* Create string object */
7137 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7138 if (*outbytes == NULL) {
7139 Py_DECREF(substring);
7140 return -1;
7141 }
7142 out = PyBytes_AS_STRING(*outbytes);
7143 }
7144 else {
7145 /* Extend string object */
7146 const Py_ssize_t n = PyBytes_Size(*outbytes);
7147 if (outsize > PY_SSIZE_T_MAX - n) {
7148 PyErr_NoMemory();
7149 Py_DECREF(substring);
7150 return -1;
7151 }
7152 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7153 Py_DECREF(substring);
7154 return -1;
7155 }
7156 out = PyBytes_AS_STRING(*outbytes) + n;
7157 }
7158
7159 /* Do the conversion */
7160 outsize = WideCharToMultiByte(code_page, flags,
7161 p, size,
7162 out, outsize,
7163 NULL, pusedDefaultChar);
7164 Py_CLEAR(substring);
7165 if (outsize <= 0)
7166 goto error;
7167 if (pusedDefaultChar && *pusedDefaultChar)
7168 return -2;
7169 return 0;
7170
7171 error:
7172 Py_XDECREF(substring);
7173 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7174 return -2;
7175 PyErr_SetFromWindowsErr(0);
7176 return -1;
7177 }
7178
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * Encodes the `insize` characters of `unicode` starting at `unicode_offset`,
 * one character at a time.  The bytes are stored in a new bytes object when
 * *outbytes is NULL, or appended to *outbytes otherwise.
 *
 * Returns 0 on success, or raises a WindowsError or UnicodeEncodeError and
 * returns -1 on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* The "used default char" out-parameter is only passed for code pages
       other than CP_UTF8 and CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) output bytes per input character, after
       checking that the multiplication cannot overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Build the UTF-16 form of the character: one unit below 0x10000,
           a surrogate pair otherwise. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            ch -= 0x10000;
            chars[0] = 0xd800 + (ch >> 10);
            chars[1] = 0xdc00 + (ch & 0x3ff);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Success, unless the default character had to be used: in
               that case fall through to the error handler below. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character cannot be encoded: ask the error handler for a
           replacement (bytes or str) and the position to resume at. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* Bytes replacement: copied to the output verbatim */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Adjust the output size by (outsize - 1) bytes; the write
                   pointer is re-derived because the resize may move the
                   buffer. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* str replacement: only characters <= 127 are accepted, each
               written as a single byte */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) < 0) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    /* NOTE(review): pos was already set to newpos above, so
                       the range reported here is the resume position, not
                       the character that failed -- confirm intended. */
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Shrink the bytes object to the number of bytes actually written */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7367
7368 static PyObject *
7369 encode_code_page(int code_page,
7370 PyObject *unicode,
7371 const char *errors)
7372 {
7373 Py_ssize_t len;
7374 PyObject *outbytes = NULL;
7375 Py_ssize_t offset;
7376 int chunk_len, ret, done;
7377
7378 if (PyUnicode_READY(unicode) < 0)
7379 return NULL;
7380 len = PyUnicode_GET_LENGTH(unicode);
7381
7382 if (code_page < 0) {
7383 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7384 return NULL;
7385 }
7386
7387 if (len == 0)
7388 return PyBytes_FromStringAndSize(NULL, 0);
7389
7390 offset = 0;
7391 do
7392 {
7393 #ifdef NEED_RETRY
7394 /* UTF-16 encoding may double the size, so use only INT_MAX/2
7395 chunks. */
7396 if (len > INT_MAX/2) {
7397 chunk_len = INT_MAX/2;
7398 done = 0;
7399 }
7400 else
7401 #endif
7402 {
7403 chunk_len = (int)len;
7404 done = 1;
7405 }
7406
7407 ret = encode_code_page_strict(code_page, &outbytes,
7408 unicode, offset, chunk_len,
7409 errors);
7410 if (ret == -2)
7411 ret = encode_code_page_errors(code_page, &outbytes,
7412 unicode, offset,
7413 chunk_len, errors);
7414 if (ret < 0) {
7415 Py_XDECREF(outbytes);
7416 return NULL;
7417 }
7418
7419 offset += chunk_len;
7420 len -= chunk_len;
7421 } while (!done);
7422
7423 return outbytes;
7424 }
7425
7426 PyObject *
7427 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7428 Py_ssize_t size,
7429 const char *errors)
7430 {
7431 PyObject *unicode, *res;
7432 unicode = PyUnicode_FromUnicode(p, size);
7433 if (unicode == NULL)
7434 return NULL;
7435 res = encode_code_page(CP_ACP, unicode, errors);
7436 Py_DECREF(unicode);
7437 return res;
7438 }
7439
7440 PyObject *
7441 PyUnicode_EncodeCodePage(int code_page,
7442 PyObject *unicode,
7443 const char *errors)
7444 {
7445 return encode_code_page(code_page, unicode, errors);
7446 }
7447
7448 PyObject *
7449 PyUnicode_AsMBCSString(PyObject *unicode)
7450 {
7451 if (!PyUnicode_Check(unicode)) {
7452 PyErr_BadArgument();
7453 return NULL;
7454 }
7455 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7456 }
7457
7458 #undef NEED_RETRY
7459
7460 #endif /* HAVE_MBCS */
7461
7462 /* --- Character Mapping Codec -------------------------------------------- */
7463
7464 PyObject *
7465 PyUnicode_DecodeCharmap(const char *s,
7466 Py_ssize_t size,
7467 PyObject *mapping,
7468 const char *errors)
7469 {
7470 const char *starts = s;
7471 Py_ssize_t startinpos;
7472 Py_ssize_t endinpos;
7473 Py_ssize_t outpos;
7474 const char *e;
7475 PyObject *v;
7476 Py_ssize_t extrachars = 0;
7477 PyObject *errorHandler = NULL;
7478 PyObject *exc = NULL;
7479
7480 /* Default to Latin-1 */
7481 if (mapping == NULL)
7482 return PyUnicode_DecodeLatin1(s, size, errors);
7483
7484 v = PyUnicode_New(size, 127);
7485 if (v == NULL)
7486 goto onError;
7487 if (size == 0)
7488 return v;
7489 outpos = 0;
7490 e = s + size;
7491 if (PyUnicode_CheckExact(mapping)) {
7492 Py_ssize_t maplen;
7493 enum PyUnicode_Kind kind;
7494 void *data;
7495 Py_UCS4 x;
7496
7497 if (PyUnicode_READY(mapping) < 0)
7498 return NULL;
7499
7500 maplen = PyUnicode_GET_LENGTH(mapping);
7501 data = PyUnicode_DATA(mapping);
7502 kind = PyUnicode_KIND(mapping);
7503 while (s < e) {
7504 unsigned char ch = *s;
7505
7506 if (ch < maplen)
7507 x = PyUnicode_READ(kind, data, ch);
7508 else
7509 x = 0xfffe; /* invalid value */
7510
7511 if (x == 0xfffe)
7512 {
7513 /* undefined mapping */
7514 startinpos = s-starts;
7515 endinpos = startinpos+1;
7516 if (unicode_decode_call_errorhandler(
7517 errors, &errorHandler,
7518 "charmap", "character maps to <undefined>",
7519 &starts, &e, &startinpos, &endinpos, &exc, &s,
7520 &v, &outpos)) {
7521 goto onError;
7522 }
7523 continue;
7524 }
7525
7526 if (unicode_putchar(&v, &outpos, x) < 0)
7527 goto onError;
7528 ++s;
7529 }
7530 }
7531 else {
7532 while (s < e) {
7533 unsigned char ch = *s;
7534 PyObject *w, *x;
7535
7536 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7537 w = PyLong_FromLong((long)ch);
7538 if (w == NULL)
7539 goto onError;
7540 x = PyObject_GetItem(mapping, w);
7541 Py_DECREF(w);
7542 if (x == NULL) {
7543 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7544 /* No mapping found means: mapping is undefined. */
7545 PyErr_Clear();
7546 x = Py_None;
7547 Py_INCREF(x);
7548 } else
7549 goto onError;
7550 }
7551
7552 /* Apply mapping */
7553 if (PyLong_Check(x)) {
7554 long value = PyLong_AS_LONG(x);
7555 if (value < 0 || value > 65535) {
7556 PyErr_SetString(PyExc_TypeError,
7557 "character mapping must be in range(65536)");
7558 Py_DECREF(x);
7559 goto onError;
7560 }
7561 if (unicode_putchar(&v, &outpos, value) < 0)
7562 goto onError;
7563 }
7564 else if (x == Py_None) {
7565 /* undefined mapping */
7566 startinpos = s-starts;
7567 endinpos = startinpos+1;
7568 if (unicode_decode_call_errorhandler(
7569 errors, &errorHandler,
7570 "charmap", "character maps to <undefined>",
7571 &starts, &e, &startinpos, &endinpos, &exc, &s,
7572 &v, &outpos)) {
7573 Py_DECREF(x);
7574 goto onError;
7575 }
7576 Py_DECREF(x);
7577 continue;
7578 }
7579 else if (PyUnicode_Check(x)) {
7580 Py_ssize_t targetsize;
7581
7582 if (PyUnicode_READY(x) < 0)
7583 goto onError;
7584 targetsize = PyUnicode_GET_LENGTH(x);
7585
7586 if (targetsize == 1) {
7587 /* 1-1 mapping */
7588 if (unicode_putchar(&v, &outpos,
7589 PyUnicode_READ_CHAR(x, 0)) < 0)
7590 goto onError;
7591 }
7592 else if (targetsize > 1) {
7593 /* 1-n mapping */
7594 if (targetsize > extrachars) {
7595 /* resize first */
7596 Py_ssize_t needed = (targetsize - extrachars) + \
7597 (targetsize << 2);
7598 extrachars += needed;
7599 /* XXX overflow detection missing */
7600 if (PyUnicode_Resize(&v,
7601 PyUnicode_GET_LENGTH(v) + needed) < 0) {
7602 Py_DECREF(x);
7603 goto onError;
7604 }
7605 }
7606 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7607 goto onError;
7608 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7609 outpos += targetsize;
7610 extrachars -= targetsize;
7611 }
7612 /* 1-0 mapping: skip the character */
7613 }
7614 else {
7615 /* wrong return value */
7616 PyErr_SetString(PyExc_TypeError,
7617 "character mapping must return integer, None or str");
7618 Py_DECREF(x);
7619 goto onError;
7620 }
7621 Py_DECREF(x);
7622 ++s;
7623 }
7624 }
7625 if (PyUnicode_Resize(&v, outpos) < 0)
7626 goto onError;
7627 Py_XDECREF(errorHandler);
7628 Py_XDECREF(exc);
7629 return unicode_result(v);
7630
7631 onError:
7632 Py_XDECREF(errorHandler);
7633 Py_XDECREF(exc);
7634 Py_XDECREF(v);
7635 return NULL;
7636 }
7637
7638 /* Charmap encoding: the lookup table */
7639
7640 struct encoding_map {
7641 PyObject_HEAD
7642 unsigned char level1[32];
7643 int count2, count3;
7644 unsigned char level23[1];
7645 };
7646
7647 static PyObject*
7648 encoding_map_size(PyObject *obj, PyObject* args)
7649 {
7650 struct encoding_map *map = (struct encoding_map*)obj;
7651 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7652 128*map->count3);
7653 }
7654
7655 static PyMethodDef encoding_map_methods[] = {
7656 {"size", encoding_map_size, METH_NOARGS,
7657 PyDoc_STR("Return the size (in bytes) of this object") },
7658 { 0 }
7659 };
7660
7661 static void
7662 encoding_map_dealloc(PyObject* o)
7663 {
7664 PyObject_FREE(o);
7665 }
7666
/* Type object of the encoding maps created by PyUnicode_BuildEncodingMap().
   Only tp_name, tp_basicsize, tp_dealloc, tp_flags and tp_methods are
   filled in; every other slot is left at 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7710
7711 PyObject*
7712 PyUnicode_BuildEncodingMap(PyObject* string)
7713 {
7714 PyObject *result;
7715 struct encoding_map *mresult;
7716 int i;
7717 int need_dict = 0;
7718 unsigned char level1[32];
7719 unsigned char level2[512];
7720 unsigned char *mlevel1, *mlevel2, *mlevel3;
7721 int count2 = 0, count3 = 0;
7722 int kind;
7723 void *data;
7724 Py_UCS4 ch;
7725
7726 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7727 PyErr_BadArgument();
7728 return NULL;
7729 }
7730 kind = PyUnicode_KIND(string);
7731 data = PyUnicode_DATA(string);
7732 memset(level1, 0xFF, sizeof level1);
7733 memset(level2, 0xFF, sizeof level2);
7734
7735 /* If there isn't a one-to-one mapping of NULL to \0,
7736 or if there are non-BMP characters, we need to use
7737 a mapping dictionary. */
7738 if (PyUnicode_READ(kind, data, 0) != 0)
7739 need_dict = 1;
7740 for (i = 1; i < 256; i++) {
7741 int l1, l2;
7742 ch = PyUnicode_READ(kind, data, i);
7743 if (ch == 0 || ch > 0xFFFF) {
7744 need_dict = 1;
7745 break;
7746 }
7747 if (ch == 0xFFFE)
7748 /* unmapped character */
7749 continue;
7750 l1 = ch >> 11;
7751 l2 = ch >> 7;
7752 if (level1[l1] == 0xFF)
7753 level1[l1] = count2++;
7754 if (level2[l2] == 0xFF)
7755 level2[l2] = count3++;
7756 }
7757
7758 if (count2 >= 0xFF || count3 >= 0xFF)
7759 need_dict = 1;
7760
7761 if (need_dict) {
7762 PyObject *result = PyDict_New();
7763 PyObject *key, *value;
7764 if (!result)
7765 return NULL;
7766 for (i = 0; i < 256; i++) {
7767 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7768 value = PyLong_FromLong(i);
7769 if (!key || !value)
7770 goto failed1;
7771 if (PyDict_SetItem(result, key, value) == -1)
7772 goto failed1;
7773 Py_DECREF(key);
7774 Py_DECREF(value);
7775 }
7776 return result;
7777 failed1:
7778 Py_XDECREF(key);
7779 Py_XDECREF(value);
7780 Py_DECREF(result);
7781 return NULL;
7782 }
7783
7784 /* Create a three-level trie */
7785 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7786 16*count2 + 128*count3 - 1);
7787 if (!result)
7788 return PyErr_NoMemory();
7789 PyObject_Init(result, &EncodingMapType);
7790 mresult = (struct encoding_map*)result;
7791 mresult->count2 = count2;
7792 mresult->count3 = count3;
7793 mlevel1 = mresult->level1;
7794 mlevel2 = mresult->level23;
7795 mlevel3 = mresult->level23 + 16*count2;
7796 memcpy(mlevel1, level1, 32);
7797 memset(mlevel2, 0xFF, 16*count2);
7798 memset(mlevel3, 0, 128*count3);
7799 count3 = 0;
7800 for (i = 1; i < 256; i++) {
7801 int o1, o2, o3, i2, i3;
7802 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7803 /* unmapped character */
7804 continue;
7805 o1 = PyUnicode_READ(kind, data, i)>>11;
7806 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7807 i2 = 16*mlevel1[o1] + o2;
7808 if (mlevel2[i2] == 0xFF)
7809 mlevel2[i2] = count3++;
7810 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7811 i3 = 128*mlevel2[i2] + o3;
7812 mlevel3[i3] = i;
7813 }
7814 return result;
7815 }
7816
7817 static int
7818 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7819 {
7820 struct encoding_map *map = (struct encoding_map*)mapping;
7821 int l1 = c>>11;
7822 int l2 = (c>>7) & 0xF;
7823 int l3 = c & 0x7F;
7824 int i;
7825
7826 if (c > 0xFFFF)
7827 return -1;
7828 if (c == 0)
7829 return 0;
7830 /* level 1*/
7831 i = map->level1[l1];
7832 if (i == 0xFF) {
7833 return -1;
7834 }
7835 /* level 2*/
7836 i = map->level23[16*i+l2];
7837 if (i == 0xFF) {
7838 return -1;
7839 }
7840 /* level 3 */
7841 i = map->level23[16*map->count2 + 128*i + l3];
7842 if (i == 0) {
7843 return -1;
7844 }
7845 return i;
7846 }
7847
7848 /* Lookup the character ch in the mapping. If the character
7849 can't be found, Py_None is returned (or NULL, if another
7850 error occurred). */
7851 static PyObject *
7852 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7853 {
7854 PyObject *w = PyLong_FromLong((long)c);
7855 PyObject *x;
7856
7857 if (w == NULL)
7858 return NULL;
7859 x = PyObject_GetItem(mapping, w);
7860 Py_DECREF(w);
7861 if (x == NULL) {
7862 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7863 /* No mapping found means: mapping is undefined. */
7864 PyErr_Clear();
7865 x = Py_None;
7866 Py_INCREF(x);
7867 return x;
7868 } else
7869 return NULL;
7870 }
7871 else if (x == Py_None)
7872 return x;
7873 else if (PyLong_Check(x)) {
7874 long value = PyLong_AS_LONG(x);
7875 if (value < 0 || value > 255) {
7876 PyErr_SetString(PyExc_TypeError,
7877 "character mapping must be in range(256)");
7878 Py_DECREF(x);
7879 return NULL;
7880 }
7881 return x;
7882 }
7883 else if (PyBytes_Check(x))
7884 return x;
7885 else {
7886 /* wrong return value */
7887 PyErr_Format(PyExc_TypeError,
7888 "character mapping must return integer, bytes or None, not %.400s",
7889 x->ob_type->tp_name);
7890 Py_DECREF(x);
7891 return NULL;
7892 }
7893 }
7894
7895 static int
7896 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7897 {
7898 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7899 /* exponentially overallocate to minimize reallocations */
7900 if (requiredsize < 2*outsize)
7901 requiredsize = 2*outsize;
7902 if (_PyBytes_Resize(outobj, requiredsize))
7903 return -1;
7904 return 0;
7905 }
7906
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
7910 /* lookup the character, put the result in the output string and adjust
7911 various state variables. Resize the output bytes object if not enough
7912 space is available. Return a new reference to the object that
7913 was put in the output buffer, or Py_None, if the mapping was undefined
7914 (in which case no character was written) or NULL, if a
7915 reallocation error occurred. The caller must decref the result */
7916 static charmapencode_result
7917 charmapencode_output(Py_UCS4 c, PyObject *mapping,
7918 PyObject **outobj, Py_ssize_t *outpos)
7919 {
7920 PyObject *rep;
7921 char *outstart;
7922 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7923
7924 if (Py_TYPE(mapping) == &EncodingMapType) {
7925 int res = encoding_map_lookup(c, mapping);
7926 Py_ssize_t requiredsize = *outpos+1;
7927 if (res == -1)
7928 return enc_FAILED;
7929 if (outsize<requiredsize)
7930 if (charmapencode_resize(outobj, outpos, requiredsize))
7931 return enc_EXCEPTION;
7932 outstart = PyBytes_AS_STRING(*outobj);
7933 outstart[(*outpos)++] = (char)res;
7934 return enc_SUCCESS;
7935 }
7936
7937 rep = charmapencode_lookup(c, mapping);
7938 if (rep==NULL)
7939 return enc_EXCEPTION;
7940 else if (rep==Py_None) {
7941 Py_DECREF(rep);
7942 return enc_FAILED;
7943 } else {
7944 if (PyLong_Check(rep)) {
7945 Py_ssize_t requiredsize = *outpos+1;
7946 if (outsize<requiredsize)
7947 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7948 Py_DECREF(rep);
7949 return enc_EXCEPTION;
7950 }
7951 outstart = PyBytes_AS_STRING(*outobj);
7952 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7953 }
7954 else {
7955 const char *repchars = PyBytes_AS_STRING(rep);
7956 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7957 Py_ssize_t requiredsize = *outpos+repsize;
7958 if (outsize<requiredsize)
7959 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7960 Py_DECREF(rep);
7961 return enc_EXCEPTION;
7962 }
7963 outstart = PyBytes_AS_STRING(*outobj);
7964 memcpy(outstart + *outpos, repchars, repsize);
7965 *outpos += repsize;
7966 }
7967 }
7968 Py_DECREF(rep);
7969 return enc_SUCCESS;
7970 }
7971
7972 /* handle an error in PyUnicode_EncodeCharmap
7973 Return 0 on success, -1 on error */
7974 static int
7975 charmap_encoding_error(
7976 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
7977 PyObject **exceptionObject,
7978 int *known_errorHandler, PyObject **errorHandler, const char *errors,
7979 PyObject **res, Py_ssize_t *respos)
7980 {
7981 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7982 Py_ssize_t size, repsize;
7983 Py_ssize_t newpos;
7984 enum PyUnicode_Kind kind;
7985 void *data;
7986 Py_ssize_t index;
7987 /* startpos for collecting unencodable chars */
7988 Py_ssize_t collstartpos = *inpos;
7989 Py_ssize_t collendpos = *inpos+1;
7990 Py_ssize_t collpos;
7991 char *encoding = "charmap";
7992 char *reason = "character maps to <undefined>";
7993 charmapencode_result x;
7994 Py_UCS4 ch;
7995 int val;
7996
7997 if (PyUnicode_READY(unicode) < 0)
7998 return -1;
7999 size = PyUnicode_GET_LENGTH(unicode);
8000 /* find all unencodable characters */
8001 while (collendpos < size) {
8002 PyObject *rep;
8003 if (Py_TYPE(mapping) == &EncodingMapType) {
8004 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8005 val = encoding_map_lookup(ch, mapping);
8006 if (val != -1)
8007 break;
8008 ++collendpos;
8009 continue;
8010 }
8011
8012 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8013 rep = charmapencode_lookup(ch, mapping);
8014 if (rep==NULL)
8015 return -1;
8016 else if (rep!=Py_None) {
8017 Py_DECREF(rep);
8018 break;
8019 }
8020 Py_DECREF(rep);
8021 ++collendpos;
8022 }
8023 /* cache callback name lookup
8024 * (if not done yet, i.e. it's the first error) */
8025 if (*known_errorHandler==-1) {
8026 if ((errors==NULL) || (!strcmp(errors, "strict")))
8027 *known_errorHandler = 1;
8028 else if (!strcmp(errors, "replace"))
8029 *known_errorHandler = 2;
8030 else if (!strcmp(errors, "ignore"))
8031 *known_errorHandler = 3;
8032 else if (!strcmp(errors, "xmlcharrefreplace"))
8033 *known_errorHandler = 4;
8034 else
8035 *known_errorHandler = 0;
8036 }
8037 switch (*known_errorHandler) {
8038 case 1: /* strict */
8039 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8040 return -1;
8041 case 2: /* replace */
8042 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8043 x = charmapencode_output('?', mapping, res, respos);
8044 if (x==enc_EXCEPTION) {
8045 return -1;
8046 }
8047 else if (x==enc_FAILED) {
8048 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8049 return -1;
8050 }
8051 }
8052 /* fall through */
8053 case 3: /* ignore */
8054 *inpos = collendpos;
8055 break;
8056 case 4: /* xmlcharrefreplace */
8057 /* generate replacement (temporarily (mis)uses p) */
8058 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8059 char buffer[2+29+1+1];
8060 char *cp;
8061 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8062 for (cp = buffer; *cp; ++cp) {
8063 x = charmapencode_output(*cp, mapping, res, respos);
8064 if (x==enc_EXCEPTION)
8065 return -1;
8066 else if (x==enc_FAILED) {
8067 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8068 return -1;
8069 }
8070 }
8071 }
8072 *inpos = collendpos;
8073 break;
8074 default:
8075 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
8076 encoding, reason, unicode, exceptionObject,
8077 collstartpos, collendpos, &newpos);
8078 if (repunicode == NULL)
8079 return -1;
8080 if (PyBytes_Check(repunicode)) {
8081 /* Directly copy bytes result to output. */
8082 Py_ssize_t outsize = PyBytes_Size(*res);
8083 Py_ssize_t requiredsize;
8084 repsize = PyBytes_Size(repunicode);
8085 requiredsize = *respos + repsize;
8086 if (requiredsize > outsize)
8087 /* Make room for all additional bytes. */
8088 if (charmapencode_resize(res, respos, requiredsize)) {
8089 Py_DECREF(repunicode);
8090 return -1;
8091 }
8092 memcpy(PyBytes_AsString(*res) + *respos,
8093 PyBytes_AsString(repunicode), repsize);
8094 *respos += repsize;
8095 *inpos = newpos;
8096 Py_DECREF(repunicode);
8097 break;
8098 }
8099 /* generate replacement */
8100 if (PyUnicode_READY(repunicode) < 0) {
8101 Py_DECREF(repunicode);
8102 return -1;
8103 }
8104 repsize = PyUnicode_GET_LENGTH(repunicode);
8105 data = PyUnicode_DATA(repunicode);
8106 kind = PyUnicode_KIND(repunicode);
8107 for (index = 0; index < repsize; index++) {
8108 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8109 x = charmapencode_output(repch, mapping, res, respos);
8110 if (x==enc_EXCEPTION) {
8111 Py_DECREF(repunicode);
8112 return -1;
8113 }
8114 else if (x==enc_FAILED) {
8115 Py_DECREF(repunicode);
8116 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8117 return -1;
8118 }
8119 }
8120 *inpos = newpos;
8121 Py_DECREF(repunicode);
8122 }
8123 return 0;
8124 }
8125
8126 PyObject *
8127 _PyUnicode_EncodeCharmap(PyObject *unicode,
8128 PyObject *mapping,
8129 const char *errors)
8130 {
8131 /* output object */
8132 PyObject *res = NULL;
8133 /* current input position */
8134 Py_ssize_t inpos = 0;
8135 Py_ssize_t size;
8136 /* current output position */
8137 Py_ssize_t respos = 0;
8138 PyObject *errorHandler = NULL;
8139 PyObject *exc = NULL;
8140 /* the following variable is used for caching string comparisons
8141 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8142 * 3=ignore, 4=xmlcharrefreplace */
8143 int known_errorHandler = -1;
8144
8145 if (PyUnicode_READY(unicode) < 0)
8146 return NULL;
8147 size = PyUnicode_GET_LENGTH(unicode);
8148
8149 /* Default to Latin-1 */
8150 if (mapping == NULL)
8151 return unicode_encode_ucs1(unicode, errors, 256);
8152
8153 /* allocate enough for a simple encoding without
8154 replacements, if we need more, we'll resize */
8155 res = PyBytes_FromStringAndSize(NULL, size);
8156 if (res == NULL)
8157 goto onError;
8158 if (size == 0)
8159 return res;
8160
8161 while (inpos<size) {
8162 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
8163 /* try to encode it */
8164 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8165 if (x==enc_EXCEPTION) /* error */
8166 goto onError;
8167 if (x==enc_FAILED) { /* unencodable character */
8168 if (charmap_encoding_error(unicode, &inpos, mapping,
8169 &exc,
8170 &known_errorHandler, &errorHandler, errors,
8171 &res, &respos)) {
8172 goto onError;
8173 }
8174 }
8175 else
8176 /* done with this character => adjust input position */
8177 ++inpos;
8178 }
8179
8180 /* Resize if we allocated to much */
8181 if (respos<PyBytes_GET_SIZE(res))
8182 if (_PyBytes_Resize(&res, respos) < 0)
8183 goto onError;
8184
8185 Py_XDECREF(exc);
8186 Py_XDECREF(errorHandler);
8187 return res;
8188
8189 onError:
8190 Py_XDECREF(res);
8191 Py_XDECREF(exc);
8192 Py_XDECREF(errorHandler);
8193 return NULL;
8194 }
8195
8196 /* Deprecated */
8197 PyObject *
8198 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8199 Py_ssize_t size,
8200 PyObject *mapping,
8201 const char *errors)
8202 {
8203 PyObject *result;
8204 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8205 if (unicode == NULL)
8206 return NULL;
8207 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8208 Py_DECREF(unicode);
8209 return result;
8210 }
8211
8212 PyObject *
8213 PyUnicode_AsCharmapString(PyObject *unicode,
8214 PyObject *mapping)
8215 {
8216 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8217 PyErr_BadArgument();
8218 return NULL;
8219 }
8220 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8221 }
8222
8223 /* create or adjust a UnicodeTranslateError */
8224 static void
8225 make_translate_exception(PyObject **exceptionObject,
8226 PyObject *unicode,
8227 Py_ssize_t startpos, Py_ssize_t endpos,
8228 const char *reason)
8229 {
8230 if (*exceptionObject == NULL) {
8231 *exceptionObject = _PyUnicodeTranslateError_Create(
8232 unicode, startpos, endpos, reason);
8233 }
8234 else {
8235 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8236 goto onError;
8237 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8238 goto onError;
8239 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8240 goto onError;
8241 return;
8242 onError:
8243 Py_DECREF(*exceptionObject);
8244 *exceptionObject = NULL;
8245 }
8246 }
8247
8248 /* raises a UnicodeTranslateError */
8249 static void
8250 raise_translate_exception(PyObject **exceptionObject,
8251 PyObject *unicode,
8252 Py_ssize_t startpos, Py_ssize_t endpos,
8253 const char *reason)
8254 {
8255 make_translate_exception(exceptionObject,
8256 unicode, startpos, endpos, reason);
8257 if (*exceptionObject != NULL)
8258 PyCodec_StrictErrors(*exceptionObject);
8259 }
8260
8261 /* error handling callback helper:
8262 build arguments, call the callback and check the arguments,
8263 put the result into newpos and return the replacement string, which
8264 has to be freed by the caller */
8265 static PyObject *
8266 unicode_translate_call_errorhandler(const char *errors,
8267 PyObject **errorHandler,
8268 const char *reason,
8269 PyObject *unicode, PyObject **exceptionObject,
8270 Py_ssize_t startpos, Py_ssize_t endpos,
8271 Py_ssize_t *newpos)
8272 {
8273 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8274
8275 Py_ssize_t i_newpos;
8276 PyObject *restuple;
8277 PyObject *resunicode;
8278
8279 if (*errorHandler == NULL) {
8280 *errorHandler = PyCodec_LookupError(errors);
8281 if (*errorHandler == NULL)
8282 return NULL;
8283 }
8284
8285 make_translate_exception(exceptionObject,
8286 unicode, startpos, endpos, reason);
8287 if (*exceptionObject == NULL)
8288 return NULL;
8289
8290 restuple = PyObject_CallFunctionObjArgs(
8291 *errorHandler, *exceptionObject, NULL);
8292 if (restuple == NULL)
8293 return NULL;
8294 if (!PyTuple_Check(restuple)) {
8295 PyErr_SetString(PyExc_TypeError, &argparse[4]);
8296 Py_DECREF(restuple);
8297 return NULL;
8298 }
8299 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8300 &resunicode, &i_newpos)) {
8301 Py_DECREF(restuple);
8302 return NULL;
8303 }
8304 if (i_newpos<0)
8305 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8306 else
8307 *newpos = i_newpos;
8308 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8309 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8310 Py_DECREF(restuple);
8311 return NULL;
8312 }
8313 Py_INCREF(resunicode);
8314 Py_DECREF(restuple);
8315 return resunicode;
8316 }
8317
8318 /* Lookup the character ch in the mapping and put the result in result,
8319 which must be decrefed by the caller.
8320 Return 0 on success, -1 on error */
8321 static int
8322 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8323 {
8324 PyObject *w = PyLong_FromLong((long)c);
8325 PyObject *x;
8326
8327 if (w == NULL)
8328 return -1;
8329 x = PyObject_GetItem(mapping, w);
8330 Py_DECREF(w);
8331 if (x == NULL) {
8332 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8333 /* No mapping found means: use 1:1 mapping. */
8334 PyErr_Clear();
8335 *result = NULL;
8336 return 0;
8337 } else
8338 return -1;
8339 }
8340 else if (x == Py_None) {
8341 *result = x;
8342 return 0;
8343 }
8344 else if (PyLong_Check(x)) {
8345 long value = PyLong_AS_LONG(x);
8346 long max = PyUnicode_GetMax();
8347 if (value < 0 || value > max) {
8348 PyErr_Format(PyExc_TypeError,
8349 "character mapping must be in range(0x%x)", max+1);
8350 Py_DECREF(x);
8351 return -1;
8352 }
8353 *result = x;
8354 return 0;
8355 }
8356 else if (PyUnicode_Check(x)) {
8357 *result = x;
8358 return 0;
8359 }
8360 else {
8361 /* wrong return value */
8362 PyErr_SetString(PyExc_TypeError,
8363 "character mapping must return integer, None or str");
8364 Py_DECREF(x);
8365 return -1;
8366 }
8367 }
8368 /* ensure that *outobj is at least requiredsize characters long,
8369 if not reallocate and adjust various state variables.
8370 Return 0 on success, -1 on error */
8371 static int
8372 charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8373 Py_ssize_t requiredsize)
8374 {
8375 Py_ssize_t oldsize = *psize;
8376 if (requiredsize > oldsize) {
8377 /* exponentially overallocate to minimize reallocations */
8378 if (requiredsize < 2 * oldsize)
8379 requiredsize = 2 * oldsize;
8380 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8381 if (*outobj == 0)
8382 return -1;
8383 *psize = requiredsize;
8384 }
8385 return 0;
8386 }
8387 /* lookup the character, put the result in the output string and adjust
8388 various state variables. Return a new reference to the object that
8389 was put in the output buffer in *result, or Py_None, if the mapping was
8390 undefined (in which case no character was written).
8391 The called must decref result.
8392 Return 0 on success, -1 on error. */
8393 static int
8394 charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8395 PyObject *mapping, Py_UCS4 **output,
8396 Py_ssize_t *osize, Py_ssize_t *opos,
8397 PyObject **res)
8398 {
8399 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8400 if (charmaptranslate_lookup(curinp, mapping, res))
8401 return -1;
8402 if (*res==NULL) {
8403 /* not found => default to 1:1 mapping */
8404 (*output)[(*opos)++] = curinp;
8405 }
8406 else if (*res==Py_None)
8407 ;
8408 else if (PyLong_Check(*res)) {
8409 /* no overflow check, because we know that the space is enough */
8410 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8411 }
8412 else if (PyUnicode_Check(*res)) {
8413 Py_ssize_t repsize;
8414 if (PyUnicode_READY(*res) == -1)
8415 return -1;
8416 repsize = PyUnicode_GET_LENGTH(*res);
8417 if (repsize==1) {
8418 /* no overflow check, because we know that the space is enough */
8419 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8420 }
8421 else if (repsize!=0) {
8422 /* more than one character */
8423 Py_ssize_t requiredsize = *opos +
8424 (PyUnicode_GET_LENGTH(input) - ipos) +
8425 repsize - 1;
8426 Py_ssize_t i;
8427 if (charmaptranslate_makespace(output, osize, requiredsize))
8428 return -1;
8429 for(i = 0; i < repsize; i++)
8430 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8431 }
8432 }
8433 else
8434 return -1;
8435 return 0;
8436 }
8437
8438 PyObject *
8439 _PyUnicode_TranslateCharmap(PyObject *input,
8440 PyObject *mapping,
8441 const char *errors)
8442 {
8443 /* input object */
8444 char *idata;
8445 Py_ssize_t size, i;
8446 int kind;
8447 /* output buffer */
8448 Py_UCS4 *output = NULL;
8449 Py_ssize_t osize;
8450 PyObject *res;
8451 /* current output position */
8452 Py_ssize_t opos;
8453 char *reason = "character maps to <undefined>";
8454 PyObject *errorHandler = NULL;
8455 PyObject *exc = NULL;
8456 /* the following variable is used for caching string comparisons
8457 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8458 * 3=ignore, 4=xmlcharrefreplace */
8459 int known_errorHandler = -1;
8460
8461 if (mapping == NULL) {
8462 PyErr_BadArgument();
8463 return NULL;
8464 }
8465
8466 if (PyUnicode_READY(input) == -1)
8467 return NULL;
8468 idata = (char*)PyUnicode_DATA(input);
8469 kind = PyUnicode_KIND(input);
8470 size = PyUnicode_GET_LENGTH(input);
8471 i = 0;
8472
8473 if (size == 0) {
8474 Py_INCREF(input);
8475 return input;
8476 }
8477
8478 /* allocate enough for a simple 1:1 translation without
8479 replacements, if we need more, we'll resize */
8480 osize = size;
8481 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8482 opos = 0;
8483 if (output == NULL) {
8484 PyErr_NoMemory();
8485 goto onError;
8486 }
8487
8488 while (i<size) {
8489 /* try to encode it */
8490 PyObject *x = NULL;
8491 if (charmaptranslate_output(input, i, mapping,
8492 &output, &osize, &opos, &x)) {
8493 Py_XDECREF(x);
8494 goto onError;
8495 }
8496 Py_XDECREF(x);
8497 if (x!=Py_None) /* it worked => adjust input pointer */
8498 ++i;
8499 else { /* untranslatable character */
8500 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8501 Py_ssize_t repsize;
8502 Py_ssize_t newpos;
8503 Py_ssize_t uni2;
8504 /* startpos for collecting untranslatable chars */
8505 Py_ssize_t collstart = i;
8506 Py_ssize_t collend = i+1;
8507 Py_ssize_t coll;
8508
8509 /* find all untranslatable characters */
8510 while (collend < size) {
8511 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8512 goto onError;
8513 Py_XDECREF(x);
8514 if (x!=Py_None)
8515 break;
8516 ++collend;
8517 }
8518 /* cache callback name lookup
8519 * (if not done yet, i.e. it's the first error) */
8520 if (known_errorHandler==-1) {
8521 if ((errors==NULL) || (!strcmp(errors, "strict")))
8522 known_errorHandler = 1;
8523 else if (!strcmp(errors, "replace"))
8524 known_errorHandler = 2;
8525 else if (!strcmp(errors, "ignore"))
8526 known_errorHandler = 3;
8527 else if (!strcmp(errors, "xmlcharrefreplace"))
8528 known_errorHandler = 4;
8529 else
8530 known_errorHandler = 0;
8531 }
8532 switch (known_errorHandler) {
8533 case 1: /* strict */
8534 raise_translate_exception(&exc, input, collstart,
8535 collend, reason);
8536 goto onError;
8537 case 2: /* replace */
8538 /* No need to check for space, this is a 1:1 replacement */
8539 for (coll = collstart; coll<collend; coll++)
8540 output[opos++] = '?';
8541 /* fall through */
8542 case 3: /* ignore */
8543 i = collend;
8544 break;
8545 case 4: /* xmlcharrefreplace */
8546 /* generate replacement (temporarily (mis)uses i) */
8547 for (i = collstart; i < collend; ++i) {
8548 char buffer[2+29+1+1];
8549 char *cp;
8550 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8551 if (charmaptranslate_makespace(&output, &osize,
8552 opos+strlen(buffer)+(size-collend)))
8553 goto onError;
8554 for (cp = buffer; *cp; ++cp)
8555 output[opos++] = *cp;
8556 }
8557 i = collend;
8558 break;
8559 default:
8560 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8561 reason, input, &exc,
8562 collstart, collend, &newpos);
8563 if (repunicode == NULL)
8564 goto onError;
8565 if (PyUnicode_READY(repunicode) < 0) {
8566 Py_DECREF(repunicode);
8567 goto onError;
8568 }
8569 /* generate replacement */
8570 repsize = PyUnicode_GET_LENGTH(repunicode);
8571 if (charmaptranslate_makespace(&output, &osize,
8572 opos+repsize+(size-collend))) {
8573 Py_DECREF(repunicode);
8574 goto onError;
8575 }
8576 for (uni2 = 0; repsize-->0; ++uni2)
8577 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8578 i = newpos;
8579 Py_DECREF(repunicode);
8580 }
8581 }
8582 }
8583 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8584 if (!res)
8585 goto onError;
8586 PyMem_Free(output);
8587 Py_XDECREF(exc);
8588 Py_XDECREF(errorHandler);
8589 return res;
8590
8591 onError:
8592 PyMem_Free(output);
8593 Py_XDECREF(exc);
8594 Py_XDECREF(errorHandler);
8595 return NULL;
8596 }
8597
8598 /* Deprecated. Use PyUnicode_Translate instead. */
8599 PyObject *
8600 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8601 Py_ssize_t size,
8602 PyObject *mapping,
8603 const char *errors)
8604 {
8605 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8606 if (!unicode)
8607 return NULL;
8608 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8609 }
8610
8611 PyObject *
8612 PyUnicode_Translate(PyObject *str,
8613 PyObject *mapping,
8614 const char *errors)
8615 {
8616 PyObject *result;
8617
8618 str = PyUnicode_FromObject(str);
8619 if (str == NULL)
8620 goto onError;
8621 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8622 Py_DECREF(str);
8623 return result;
8624
8625 onError:
8626 Py_XDECREF(str);
8627 return NULL;
8628 }
8629
8630 static Py_UCS4
8631 fix_decimal_and_space_to_ascii(PyObject *self)
8632 {
8633 /* No need to call PyUnicode_READY(self) because this function is only
8634 called as a callback from fixup() which does it already. */
8635 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8636 const int kind = PyUnicode_KIND(self);
8637 void *data = PyUnicode_DATA(self);
8638 Py_UCS4 maxchar = 0, ch, fixed;
8639 Py_ssize_t i;
8640
8641 for (i = 0; i < len; ++i) {
8642 ch = PyUnicode_READ(kind, data, i);
8643 fixed = 0;
8644 if (ch > 127) {
8645 if (Py_UNICODE_ISSPACE(ch))
8646 fixed = ' ';
8647 else {
8648 const int decimal = Py_UNICODE_TODECIMAL(ch);
8649 if (decimal >= 0)
8650 fixed = '0' + decimal;
8651 }
8652 if (fixed != 0) {
8653 if (fixed > maxchar)
8654 maxchar = fixed;
8655 PyUnicode_WRITE(kind, data, i, fixed);
8656 }
8657 else if (ch > maxchar)
8658 maxchar = ch;
8659 }
8660 else if (ch > maxchar)
8661 maxchar = ch;
8662 }
8663
8664 return maxchar;
8665 }
8666
8667 PyObject *
8668 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8669 {
8670 if (!PyUnicode_Check(unicode)) {
8671 PyErr_BadInternalCall();
8672 return NULL;
8673 }
8674 if (PyUnicode_READY(unicode) == -1)
8675 return NULL;
8676 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8677 /* If the string is already ASCII, just return the same string */
8678 Py_INCREF(unicode);
8679 return unicode;
8680 }
8681 return fixup(unicode, fix_decimal_and_space_to_ascii);
8682 }
8683
8684 PyObject *
8685 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8686 Py_ssize_t length)
8687 {
8688 PyObject *decimal;
8689 Py_ssize_t i;
8690 Py_UCS4 maxchar;
8691 enum PyUnicode_Kind kind;
8692 void *data;
8693
8694 maxchar = 0;
8695 for (i = 0; i < length; i++) {
8696 Py_UNICODE ch = s[i];
8697 if (ch > 127) {
8698 int decimal = Py_UNICODE_TODECIMAL(ch);
8699 if (decimal >= 0)
8700 ch = '0' + decimal;
8701 }
8702 maxchar = Py_MAX(maxchar, ch);
8703 }
8704
8705 /* Copy to a new string */
8706 decimal = PyUnicode_New(length, maxchar);
8707 if (decimal == NULL)
8708 return decimal;
8709 kind = PyUnicode_KIND(decimal);
8710 data = PyUnicode_DATA(decimal);
8711 /* Iterate over code points */
8712 for (i = 0; i < length; i++) {
8713 Py_UNICODE ch = s[i];
8714 if (ch > 127) {
8715 int decimal = Py_UNICODE_TODECIMAL(ch);
8716 if (decimal >= 0)
8717 ch = '0' + decimal;
8718 }
8719 PyUnicode_WRITE(kind, data, i, ch);
8720 }
8721 return unicode_result(decimal);
8722 }
8723 /* --- Decimal Encoder ---------------------------------------------------- */
8724
8725 int
8726 PyUnicode_EncodeDecimal(Py_UNICODE *s,
8727 Py_ssize_t length,
8728 char *output,
8729 const char *errors)
8730 {
8731 PyObject *unicode;
8732 Py_ssize_t i;
8733 enum PyUnicode_Kind kind;
8734 void *data;
8735
8736 if (output == NULL) {
8737 PyErr_BadArgument();
8738 return -1;
8739 }
8740
8741 unicode = PyUnicode_FromUnicode(s, length);
8742 if (unicode == NULL)
8743 return -1;
8744
8745 if (PyUnicode_READY(unicode) < 0) {
8746 Py_DECREF(unicode);
8747 return -1;
8748 }
8749 kind = PyUnicode_KIND(unicode);
8750 data = PyUnicode_DATA(unicode);
8751
8752 for (i=0; i < length; ) {
8753 PyObject *exc;
8754 Py_UCS4 ch;
8755 int decimal;
8756 Py_ssize_t startpos;
8757
8758 ch = PyUnicode_READ(kind, data, i);
8759
8760 if (Py_UNICODE_ISSPACE(ch)) {
8761 *output++ = ' ';
8762 i++;
8763 continue;
8764 }
8765 decimal = Py_UNICODE_TODECIMAL(ch);
8766 if (decimal >= 0) {
8767 *output++ = '0' + decimal;
8768 i++;
8769 continue;
8770 }
8771 if (0 < ch && ch < 256) {
8772 *output++ = (char)ch;
8773 i++;
8774 continue;
8775 }
8776
8777 startpos = i;
8778 exc = NULL;
8779 raise_encode_exception(&exc, "decimal", unicode,
8780 startpos, startpos+1,
8781 "invalid decimal Unicode string");
8782 Py_XDECREF(exc);
8783 Py_DECREF(unicode);
8784 return -1;
8785 }
8786 /* 0-terminate the output string */
8787 *output++ = '\0';
8788 Py_DECREF(unicode);
8789 return 0;
8790 }
8791
8792 /* --- Helpers ------------------------------------------------------------ */
8793
8794 static Py_ssize_t
8795 any_find_slice(int direction, PyObject* s1, PyObject* s2,
8796 Py_ssize_t start,
8797 Py_ssize_t end)
8798 {
8799 int kind1, kind2, kind;
8800 void *buf1, *buf2;
8801 Py_ssize_t len1, len2, result;
8802
8803 kind1 = PyUnicode_KIND(s1);
8804 kind2 = PyUnicode_KIND(s2);
8805 kind = kind1 > kind2 ? kind1 : kind2;
8806 buf1 = PyUnicode_DATA(s1);
8807 buf2 = PyUnicode_DATA(s2);
8808 if (kind1 != kind)
8809 buf1 = _PyUnicode_AsKind(s1, kind);
8810 if (!buf1)
8811 return -2;
8812 if (kind2 != kind)
8813 buf2 = _PyUnicode_AsKind(s2, kind);
8814 if (!buf2) {
8815 if (kind1 != kind) PyMem_Free(buf1);
8816 return -2;
8817 }
8818 len1 = PyUnicode_GET_LENGTH(s1);
8819 len2 = PyUnicode_GET_LENGTH(s2);
8820
8821 if (direction > 0) {
8822 switch(kind) {
8823 case PyUnicode_1BYTE_KIND:
8824 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8825 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8826 else
8827 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8828 break;
8829 case PyUnicode_2BYTE_KIND:
8830 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8831 break;
8832 case PyUnicode_4BYTE_KIND:
8833 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8834 break;
8835 default:
8836 assert(0); result = -2;
8837 }
8838 }
8839 else {
8840 switch(kind) {
8841 case PyUnicode_1BYTE_KIND:
8842 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8843 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8844 else
8845 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8846 break;
8847 case PyUnicode_2BYTE_KIND:
8848 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8849 break;
8850 case PyUnicode_4BYTE_KIND:
8851 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8852 break;
8853 default:
8854 assert(0); result = -2;
8855 }
8856 }
8857
8858 if (kind1 != kind)
8859 PyMem_Free(buf1);
8860 if (kind2 != kind)
8861 PyMem_Free(buf2);
8862
8863 return result;
8864 }
8865
8866 Py_ssize_t
8867 _PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
8868 Py_ssize_t n_buffer,
8869 void *digits, Py_ssize_t n_digits,
8870 Py_ssize_t min_width,
8871 const char *grouping,
8872 const char *thousands_sep)
8873 {
8874 switch(kind) {
8875 case PyUnicode_1BYTE_KIND:
8876 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8877 return _PyUnicode_ascii_InsertThousandsGrouping(
8878 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8879 min_width, grouping, thousands_sep);
8880 else
8881 return _PyUnicode_ucs1_InsertThousandsGrouping(
8882 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8883 min_width, grouping, thousands_sep);
8884 case PyUnicode_2BYTE_KIND:
8885 return _PyUnicode_ucs2_InsertThousandsGrouping(
8886 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8887 min_width, grouping, thousands_sep);
8888 case PyUnicode_4BYTE_KIND:
8889 return _PyUnicode_ucs4_InsertThousandsGrouping(
8890 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8891 min_width, grouping, thousands_sep);
8892 }
8893 assert(0);
8894 return -1;
8895 }
8896
8897
/* helper macro to fixup start/end slice values:
   `end` is clamped to `len`; negative `start`/`end` count from the end of
   the sequence and are clamped to 0.  `start` and `end` must be
   modifiable lvalues; `len` is evaluated more than once, so it must not
   have side effects.
   Wrapped in do { } while (0) so that the macro behaves like a single
   statement (e.g. as the body of an if/else); use it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8912
8913 Py_ssize_t
8914 PyUnicode_Count(PyObject *str,
8915 PyObject *substr,
8916 Py_ssize_t start,
8917 Py_ssize_t end)
8918 {
8919 Py_ssize_t result;
8920 PyObject* str_obj;
8921 PyObject* sub_obj;
8922 int kind1, kind2, kind;
8923 void *buf1 = NULL, *buf2 = NULL;
8924 Py_ssize_t len1, len2;
8925
8926 str_obj = PyUnicode_FromObject(str);
8927 if (!str_obj || PyUnicode_READY(str_obj) == -1)
8928 return -1;
8929 sub_obj = PyUnicode_FromObject(substr);
8930 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
8931 Py_DECREF(str_obj);
8932 return -1;
8933 }
8934
8935 kind1 = PyUnicode_KIND(str_obj);
8936 kind2 = PyUnicode_KIND(sub_obj);
8937 kind = kind1 > kind2 ? kind1 : kind2;
8938 buf1 = PyUnicode_DATA(str_obj);
8939 if (kind1 != kind)
8940 buf1 = _PyUnicode_AsKind(str_obj, kind);
8941 if (!buf1)
8942 goto onError;
8943 buf2 = PyUnicode_DATA(sub_obj);
8944 if (kind2 != kind)
8945 buf2 = _PyUnicode_AsKind(sub_obj, kind);
8946 if (!buf2)
8947 goto onError;
8948 len1 = PyUnicode_GET_LENGTH(str_obj);
8949 len2 = PyUnicode_GET_LENGTH(sub_obj);
8950
8951 ADJUST_INDICES(start, end, len1);
8952 switch(kind) {
8953 case PyUnicode_1BYTE_KIND:
8954 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8955 result = asciilib_count(
8956 ((Py_UCS1*)buf1) + start, end - start,
8957 buf2, len2, PY_SSIZE_T_MAX
8958 );
8959 else
8960 result = ucs1lib_count(
8961 ((Py_UCS1*)buf1) + start, end - start,
8962 buf2, len2, PY_SSIZE_T_MAX
8963 );
8964 break;
8965 case PyUnicode_2BYTE_KIND:
8966 result = ucs2lib_count(
8967 ((Py_UCS2*)buf1) + start, end - start,
8968 buf2, len2, PY_SSIZE_T_MAX
8969 );
8970 break;
8971 case PyUnicode_4BYTE_KIND:
8972 result = ucs4lib_count(
8973 ((Py_UCS4*)buf1) + start, end - start,
8974 buf2, len2, PY_SSIZE_T_MAX
8975 );
8976 break;
8977 default:
8978 assert(0); result = 0;
8979 }
8980
8981 Py_DECREF(sub_obj);
8982 Py_DECREF(str_obj);
8983
8984 if (kind1 != kind)
8985 PyMem_Free(buf1);
8986 if (kind2 != kind)
8987 PyMem_Free(buf2);
8988
8989 return result;
8990 onError:
8991 Py_DECREF(sub_obj);
8992 Py_DECREF(str_obj);
8993 if (kind1 != kind && buf1)
8994 PyMem_Free(buf1);
8995 if (kind2 != kind && buf2)
8996 PyMem_Free(buf2);
8997 return -1;
8998 }
8999
9000 Py_ssize_t
9001 PyUnicode_Find(PyObject *str,
9002 PyObject *sub,
9003 Py_ssize_t start,
9004 Py_ssize_t end,
9005 int direction)
9006 {
9007 Py_ssize_t result;
9008
9009 str = PyUnicode_FromObject(str);
9010 if (!str || PyUnicode_READY(str) == -1)
9011 return -2;
9012 sub = PyUnicode_FromObject(sub);
9013 if (!sub || PyUnicode_READY(sub) == -1) {
9014 Py_DECREF(str);
9015 return -2;
9016 }
9017
9018 result = any_find_slice(direction,
9019 str, sub, start, end
9020 );
9021
9022 Py_DECREF(str);
9023 Py_DECREF(sub);
9024
9025 return result;
9026 }
9027
9028 Py_ssize_t
9029 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9030 Py_ssize_t start, Py_ssize_t end,
9031 int direction)
9032 {
9033 int kind;
9034 Py_ssize_t result;
9035 if (PyUnicode_READY(str) == -1)
9036 return -2;
9037 if (start < 0 || end < 0) {
9038 PyErr_SetString(PyExc_IndexError, "string index out of range");
9039 return -2;
9040 }
9041 if (end > PyUnicode_GET_LENGTH(str))
9042 end = PyUnicode_GET_LENGTH(str);
9043 kind = PyUnicode_KIND(str);
9044 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9045 kind, end-start, ch, direction);
9046 if (result == -1)
9047 return -1;
9048 else
9049 return start + result;
9050 }
9051
9052 static int
9053 tailmatch(PyObject *self,
9054 PyObject *substring,
9055 Py_ssize_t start,
9056 Py_ssize_t end,
9057 int direction)
9058 {
9059 int kind_self;
9060 int kind_sub;
9061 void *data_self;
9062 void *data_sub;
9063 Py_ssize_t offset;
9064 Py_ssize_t i;
9065 Py_ssize_t end_sub;
9066
9067 if (PyUnicode_READY(self) == -1 ||
9068 PyUnicode_READY(substring) == -1)
9069 return 0;
9070
9071 if (PyUnicode_GET_LENGTH(substring) == 0)
9072 return 1;
9073
9074 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9075 end -= PyUnicode_GET_LENGTH(substring);
9076 if (end < start)
9077 return 0;
9078
9079 kind_self = PyUnicode_KIND(self);
9080 data_self = PyUnicode_DATA(self);
9081 kind_sub = PyUnicode_KIND(substring);
9082 data_sub = PyUnicode_DATA(substring);
9083 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9084
9085 if (direction > 0)
9086 offset = end;
9087 else
9088 offset = start;
9089
9090 if (PyUnicode_READ(kind_self, data_self, offset) ==
9091 PyUnicode_READ(kind_sub, data_sub, 0) &&
9092 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9093 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9094 /* If both are of the same kind, memcmp is sufficient */
9095 if (kind_self == kind_sub) {
9096 return ! memcmp((char *)data_self +
9097 (offset * PyUnicode_KIND(substring)),
9098 data_sub,
9099 PyUnicode_GET_LENGTH(substring) *
9100 PyUnicode_KIND(substring));
9101 }
9102 /* otherwise we have to compare each character by first accesing it */
9103 else {
9104 /* We do not need to compare 0 and len(substring)-1 because
9105 the if statement above ensured already that they are equal
9106 when we end up here. */
9107 // TODO: honor direction and do a forward or backwards search
9108 for (i = 1; i < end_sub; ++i) {
9109 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9110 PyUnicode_READ(kind_sub, data_sub, i))
9111 return 0;
9112 }
9113 return 1;
9114 }
9115 }
9116
9117 return 0;
9118 }
9119
9120 Py_ssize_t
9121 PyUnicode_Tailmatch(PyObject *str,
9122 PyObject *substr,
9123 Py_ssize_t start,
9124 Py_ssize_t end,
9125 int direction)
9126 {
9127 Py_ssize_t result;
9128
9129 str = PyUnicode_FromObject(str);
9130 if (str == NULL)
9131 return -1;
9132 substr = PyUnicode_FromObject(substr);
9133 if (substr == NULL) {
9134 Py_DECREF(str);
9135 return -1;
9136 }
9137
9138 result = tailmatch(str, substr,
9139 start, end, direction);
9140 Py_DECREF(str);
9141 Py_DECREF(substr);
9142 return result;
9143 }
9144
9145 /* Apply fixfct filter to the Unicode object self and return a
9146 reference to the modified object */
9147
9148 static PyObject *
9149 fixup(PyObject *self,
9150 Py_UCS4 (*fixfct)(PyObject *s))
9151 {
9152 PyObject *u;
9153 Py_UCS4 maxchar_old, maxchar_new = 0;
9154
9155 u = PyUnicode_Copy(self);
9156 if (u == NULL)
9157 return NULL;
9158 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9159
9160 /* fix functions return the new maximum character in a string,
9161 if the kind of the resulting unicode object does not change,
9162 everything is fine. Otherwise we need to change the string kind
9163 and re-run the fix function. */
9164 maxchar_new = fixfct(u);
9165 if (maxchar_new == 0)
9166 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9167 else if (maxchar_new <= 127)
9168 maxchar_new = 127;
9169 else if (maxchar_new <= 255)
9170 maxchar_new = 255;
9171 else if (maxchar_new <= 65535)
9172 maxchar_new = 65535;
9173 else
9174 maxchar_new = MAX_UNICODE;
9175
9176 if (!maxchar_new && PyUnicode_CheckExact(self)) {
9177 /* fixfct should return TRUE if it modified the buffer. If
9178 FALSE, return a reference to the original buffer instead
9179 (to save space, not time) */
9180 Py_INCREF(self);
9181 Py_DECREF(u);
9182 return self;
9183 }
9184 else if (maxchar_new == maxchar_old) {
9185 return u;
9186 }
9187 else {
9188 /* In case the maximum character changed, we need to
9189 convert the string to the new category. */
9190 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9191 if (v == NULL) {
9192 Py_DECREF(u);
9193 return NULL;
9194 }
9195 if (maxchar_new > maxchar_old) {
9196 /* If the maxchar increased so that the kind changed, not all
9197 characters are representable anymore and we need to fix the
9198 string again. This only happens in very few cases. */
9199 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9200 maxchar_old = fixfct(v);
9201 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9202 }
9203 else {
9204 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
9205 }
9206
9207 Py_DECREF(u);
9208 assert(_PyUnicode_CheckConsistency(v, 1));
9209 return v;
9210 }
9211 }
9212
9213 static Py_UCS4
9214 fixupper(PyObject *self)
9215 {
9216 /* No need to call PyUnicode_READY(self) because this function is only
9217 called as a callback from fixup() which does it already. */
9218 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9219 const int kind = PyUnicode_KIND(self);
9220 void *data = PyUnicode_DATA(self);
9221 int touched = 0;
9222 Py_UCS4 maxchar = 0;
9223 Py_ssize_t i;
9224
9225 for (i = 0; i < len; ++i) {
9226 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9227 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9228 if (up != ch) {
9229 if (up > maxchar)
9230 maxchar = up;
9231 PyUnicode_WRITE(kind, data, i, up);
9232 touched = 1;
9233 }
9234 else if (ch > maxchar)
9235 maxchar = ch;
9236 }
9237
9238 if (touched)
9239 return maxchar;
9240 else
9241 return 0;
9242 }
9243
9244 static Py_UCS4
9245 fixlower(PyObject *self)
9246 {
9247 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9248 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9249 const int kind = PyUnicode_KIND(self);
9250 void *data = PyUnicode_DATA(self);
9251 int touched = 0;
9252 Py_UCS4 maxchar = 0;
9253 Py_ssize_t i;
9254
9255 for(i = 0; i < len; ++i) {
9256 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9257 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9258 if (lo != ch) {
9259 if (lo > maxchar)
9260 maxchar = lo;
9261 PyUnicode_WRITE(kind, data, i, lo);
9262 touched = 1;
9263 }
9264 else if (ch > maxchar)
9265 maxchar = ch;
9266 }
9267
9268 if (touched)
9269 return maxchar;
9270 else
9271 return 0;
9272 }
9273
9274 static Py_UCS4
9275 fixswapcase(PyObject *self)
9276 {
9277 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9278 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9279 const int kind = PyUnicode_KIND(self);
9280 void *data = PyUnicode_DATA(self);
9281 int touched = 0;
9282 Py_UCS4 maxchar = 0;
9283 Py_ssize_t i;
9284
9285 for(i = 0; i < len; ++i) {
9286 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9287 Py_UCS4 nu = 0;
9288
9289 if (Py_UNICODE_ISUPPER(ch))
9290 nu = Py_UNICODE_TOLOWER(ch);
9291 else if (Py_UNICODE_ISLOWER(ch))
9292 nu = Py_UNICODE_TOUPPER(ch);
9293
9294 if (nu != 0) {
9295 if (nu > maxchar)
9296 maxchar = nu;
9297 PyUnicode_WRITE(kind, data, i, nu);
9298 touched = 1;
9299 }
9300 else if (ch > maxchar)
9301 maxchar = ch;
9302 }
9303
9304 if (touched)
9305 return maxchar;
9306 else
9307 return 0;
9308 }
9309
9310 static Py_UCS4
9311 fixcapitalize(PyObject *self)
9312 {
9313 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9314 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9315 const int kind = PyUnicode_KIND(self);
9316 void *data = PyUnicode_DATA(self);
9317 int touched = 0;
9318 Py_UCS4 maxchar = 0;
9319 Py_ssize_t i = 0;
9320 Py_UCS4 ch;
9321
9322 if (len == 0)
9323 return 0;
9324
9325 ch = PyUnicode_READ(kind, data, i);
9326 if (!Py_UNICODE_ISUPPER(ch)) {
9327 maxchar = Py_UNICODE_TOUPPER(ch);
9328 PyUnicode_WRITE(kind, data, i, maxchar);
9329 touched = 1;
9330 }
9331 ++i;
9332 for(; i < len; ++i) {
9333 ch = PyUnicode_READ(kind, data, i);
9334 if (!Py_UNICODE_ISLOWER(ch)) {
9335 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9336 if (lo > maxchar)
9337 maxchar = lo;
9338 PyUnicode_WRITE(kind, data, i, lo);
9339 touched = 1;
9340 }
9341 else if (ch > maxchar)
9342 maxchar = ch;
9343 }
9344
9345 if (touched)
9346 return maxchar;
9347 else
9348 return 0;
9349 }
9350
9351 static Py_UCS4
9352 fixtitle(PyObject *self)
9353 {
9354 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9355 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9356 const int kind = PyUnicode_KIND(self);
9357 void *data = PyUnicode_DATA(self);
9358 Py_UCS4 maxchar = 0;
9359 Py_ssize_t i = 0;
9360 int previous_is_cased;
9361
9362 /* Shortcut for single character strings */
9363 if (len == 1) {
9364 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9365 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9366 if (ti != ch) {
9367 PyUnicode_WRITE(kind, data, i, ti);
9368 return ti;
9369 }
9370 else
9371 return 0;
9372 }
9373 previous_is_cased = 0;
9374 for(; i < len; ++i) {
9375 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9376 Py_UCS4 nu;
9377
9378 if (previous_is_cased)
9379 nu = Py_UNICODE_TOLOWER(ch);
9380 else
9381 nu = Py_UNICODE_TOTITLE(ch);
9382
9383 if (nu > maxchar)
9384 maxchar = nu;
9385 PyUnicode_WRITE(kind, data, i, nu);
9386
9387 if (Py_UNICODE_ISLOWER(ch) ||
9388 Py_UNICODE_ISUPPER(ch) ||
9389 Py_UNICODE_ISTITLE(ch))
9390 previous_is_cased = 1;
9391 else
9392 previous_is_cased = 0;
9393 }
9394 return maxchar;
9395 }
9396
9397 PyObject *
9398 PyUnicode_Join(PyObject *separator, PyObject *seq)
9399 {
9400 PyObject *sep = NULL;
9401 Py_ssize_t seplen;
9402 PyObject *res = NULL; /* the result */
9403 PyObject *fseq; /* PySequence_Fast(seq) */
9404 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9405 PyObject **items;
9406 PyObject *item;
9407 Py_ssize_t sz, i, res_offset;
9408 Py_UCS4 maxchar;
9409 Py_UCS4 item_maxchar;
9410 int use_memcpy;
9411 unsigned char *res_data = NULL, *sep_data = NULL;
9412 PyObject *last_obj;
9413 unsigned int kind = 0;
9414
9415 fseq = PySequence_Fast(seq, "");
9416 if (fseq == NULL) {
9417 return NULL;
9418 }
9419
9420 /* NOTE: the following code can't call back into Python code,
9421 * so we are sure that fseq won't be mutated.
9422 */
9423
9424 seqlen = PySequence_Fast_GET_SIZE(fseq);
9425 /* If empty sequence, return u"". */
9426 if (seqlen == 0) {
9427 Py_DECREF(fseq);
9428 Py_INCREF(unicode_empty);
9429 res = unicode_empty;
9430 return res;
9431 }
9432
9433 /* If singleton sequence with an exact Unicode, return that. */
9434 last_obj = NULL;
9435 items = PySequence_Fast_ITEMS(fseq);
9436 if (seqlen == 1) {
9437 if (PyUnicode_CheckExact(items[0])) {
9438 res = items[0];
9439 Py_INCREF(res);
9440 Py_DECREF(fseq);
9441 return res;
9442 }
9443 seplen = 0;
9444 maxchar = 0;
9445 }
9446 else {
9447 /* Set up sep and seplen */
9448 if (separator == NULL) {
9449 /* fall back to a blank space separator */
9450 sep = PyUnicode_FromOrdinal(' ');
9451 if (!sep)
9452 goto onError;
9453 seplen = 1;
9454 maxchar = 32;
9455 }
9456 else {
9457 if (!PyUnicode_Check(separator)) {
9458 PyErr_Format(PyExc_TypeError,
9459 "separator: expected str instance,"
9460 " %.80s found",
9461 Py_TYPE(separator)->tp_name);
9462 goto onError;
9463 }
9464 if (PyUnicode_READY(separator))
9465 goto onError;
9466 sep = separator;
9467 seplen = PyUnicode_GET_LENGTH(separator);
9468 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9469 /* inc refcount to keep this code path symmetric with the
9470 above case of a blank separator */
9471 Py_INCREF(sep);
9472 }
9473 last_obj = sep;
9474 }
9475
9476 /* There are at least two things to join, or else we have a subclass
9477 * of str in the sequence.
9478 * Do a pre-pass to figure out the total amount of space we'll
9479 * need (sz), and see whether all argument are strings.
9480 */
9481 sz = 0;
9482 #ifdef Py_DEBUG
9483 use_memcpy = 0;
9484 #else
9485 use_memcpy = 1;
9486 #endif
9487 for (i = 0; i < seqlen; i++) {
9488 const Py_ssize_t old_sz = sz;
9489 item = items[i];
9490 if (!PyUnicode_Check(item)) {
9491 PyErr_Format(PyExc_TypeError,
9492 "sequence item %zd: expected str instance,"
9493 " %.80s found",
9494 i, Py_TYPE(item)->tp_name);
9495 goto onError;
9496 }
9497 if (PyUnicode_READY(item) == -1)
9498 goto onError;
9499 sz += PyUnicode_GET_LENGTH(item);
9500 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9501 maxchar = Py_MAX(maxchar, item_maxchar);
9502 if (i != 0)
9503 sz += seplen;
9504 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9505 PyErr_SetString(PyExc_OverflowError,
9506 "join() result is too long for a Python string");
9507 goto onError;
9508 }
9509 if (use_memcpy && last_obj != NULL) {
9510 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9511 use_memcpy = 0;
9512 }
9513 last_obj = item;
9514 }
9515
9516 res = PyUnicode_New(sz, maxchar);
9517 if (res == NULL)
9518 goto onError;
9519
9520 /* Catenate everything. */
9521 #ifdef Py_DEBUG
9522 use_memcpy = 0;
9523 #else
9524 if (use_memcpy) {
9525 res_data = PyUnicode_1BYTE_DATA(res);
9526 kind = PyUnicode_KIND(res);
9527 if (seplen != 0)
9528 sep_data = PyUnicode_1BYTE_DATA(sep);
9529 }
9530 #endif
9531 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9532 Py_ssize_t itemlen;
9533 item = items[i];
9534 /* Copy item, and maybe the separator. */
9535 if (i && seplen != 0) {
9536 if (use_memcpy) {
9537 Py_MEMCPY(res_data,
9538 sep_data,
9539 kind * seplen);
9540 res_data += kind * seplen;
9541 }
9542 else {
9543 copy_characters(res, res_offset, sep, 0, seplen);
9544 res_offset += seplen;
9545 }
9546 }
9547 itemlen = PyUnicode_GET_LENGTH(item);
9548 if (itemlen != 0) {
9549 if (use_memcpy) {
9550 Py_MEMCPY(res_data,
9551 PyUnicode_DATA(item),
9552 kind * itemlen);
9553 res_data += kind * itemlen;
9554 }
9555 else {
9556 copy_characters(res, res_offset, item, 0, itemlen);
9557 res_offset += itemlen;
9558 }
9559 }
9560 }
9561 if (use_memcpy)
9562 assert(res_data == PyUnicode_1BYTE_DATA(res)
9563 + kind * PyUnicode_GET_LENGTH(res));
9564 else
9565 assert(res_offset == PyUnicode_GET_LENGTH(res));
9566
9567 Py_DECREF(fseq);
9568 Py_XDECREF(sep);
9569 assert(_PyUnicode_CheckConsistency(res, 1));
9570 return res;
9571
9572 onError:
9573 Py_DECREF(fseq);
9574 Py_XDECREF(sep);
9575 Py_XDECREF(res);
9576 return NULL;
9577 }
9578
9579 #define FILL(kind, data, value, start, length) \
9580 do { \
9581 Py_ssize_t i_ = 0; \
9582 assert(kind != PyUnicode_WCHAR_KIND); \
9583 switch ((kind)) { \
9584 case PyUnicode_1BYTE_KIND: { \
9585 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9586 memset(to_, (unsigned char)value, length); \
9587 break; \
9588 } \
9589 case PyUnicode_2BYTE_KIND: { \
9590 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9591 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9592 break; \
9593 } \
9594 default: { \
9595 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9596 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9597 break; \
9598 } \
9599 } \
9600 } while (0)
9601
9602 static PyObject *
9603 pad(PyObject *self,
9604 Py_ssize_t left,
9605 Py_ssize_t right,
9606 Py_UCS4 fill)
9607 {
9608 PyObject *u;
9609 Py_UCS4 maxchar;
9610 int kind;
9611 void *data;
9612
9613 if (left < 0)
9614 left = 0;
9615 if (right < 0)
9616 right = 0;
9617
9618 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
9619 Py_INCREF(self);
9620 return self;
9621 }
9622
9623 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9624 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9625 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9626 return NULL;
9627 }
9628 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9629 if (fill > maxchar)
9630 maxchar = fill;
9631 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9632 if (!u)
9633 return NULL;
9634
9635 kind = PyUnicode_KIND(u);
9636 data = PyUnicode_DATA(u);
9637 if (left)
9638 FILL(kind, data, fill, 0, left);
9639 if (right)
9640 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9641 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
9642 assert(_PyUnicode_CheckConsistency(u, 1));
9643 return u;
9644 }
9645 #undef FILL
9646
9647 PyObject *
9648 PyUnicode_Splitlines(PyObject *string, int keepends)
9649 {
9650 PyObject *list;
9651
9652 string = PyUnicode_FromObject(string);
9653 if (string == NULL || PyUnicode_READY(string) == -1)
9654 return NULL;
9655
9656 switch(PyUnicode_KIND(string)) {
9657 case PyUnicode_1BYTE_KIND:
9658 if (PyUnicode_IS_ASCII(string))
9659 list = asciilib_splitlines(
9660 string, PyUnicode_1BYTE_DATA(string),
9661 PyUnicode_GET_LENGTH(string), keepends);
9662 else
9663 list = ucs1lib_splitlines(
9664 string, PyUnicode_1BYTE_DATA(string),
9665 PyUnicode_GET_LENGTH(string), keepends);
9666 break;
9667 case PyUnicode_2BYTE_KIND:
9668 list = ucs2lib_splitlines(
9669 string, PyUnicode_2BYTE_DATA(string),
9670 PyUnicode_GET_LENGTH(string), keepends);
9671 break;
9672 case PyUnicode_4BYTE_KIND:
9673 list = ucs4lib_splitlines(
9674 string, PyUnicode_4BYTE_DATA(string),
9675 PyUnicode_GET_LENGTH(string), keepends);
9676 break;
9677 default:
9678 assert(0);
9679 list = 0;
9680 }
9681 Py_DECREF(string);
9682 return list;
9683 }
9684
9685 static PyObject *
9686 split(PyObject *self,
9687 PyObject *substring,
9688 Py_ssize_t maxcount)
9689 {
9690 int kind1, kind2, kind;
9691 void *buf1, *buf2;
9692 Py_ssize_t len1, len2;
9693 PyObject* out;
9694
9695 if (maxcount < 0)
9696 maxcount = PY_SSIZE_T_MAX;
9697
9698 if (PyUnicode_READY(self) == -1)
9699 return NULL;
9700
9701 if (substring == NULL)
9702 switch(PyUnicode_KIND(self)) {
9703 case PyUnicode_1BYTE_KIND:
9704 if (PyUnicode_IS_ASCII(self))
9705 return asciilib_split_whitespace(
9706 self, PyUnicode_1BYTE_DATA(self),
9707 PyUnicode_GET_LENGTH(self), maxcount
9708 );
9709 else
9710 return ucs1lib_split_whitespace(
9711 self, PyUnicode_1BYTE_DATA(self),
9712 PyUnicode_GET_LENGTH(self), maxcount
9713 );
9714 case PyUnicode_2BYTE_KIND:
9715 return ucs2lib_split_whitespace(
9716 self, PyUnicode_2BYTE_DATA(self),
9717 PyUnicode_GET_LENGTH(self), maxcount
9718 );
9719 case PyUnicode_4BYTE_KIND:
9720 return ucs4lib_split_whitespace(
9721 self, PyUnicode_4BYTE_DATA(self),
9722 PyUnicode_GET_LENGTH(self), maxcount
9723 );
9724 default:
9725 assert(0);
9726 return NULL;
9727 }
9728
9729 if (PyUnicode_READY(substring) == -1)
9730 return NULL;
9731
9732 kind1 = PyUnicode_KIND(self);
9733 kind2 = PyUnicode_KIND(substring);
9734 kind = kind1 > kind2 ? kind1 : kind2;
9735 buf1 = PyUnicode_DATA(self);
9736 buf2 = PyUnicode_DATA(substring);
9737 if (kind1 != kind)
9738 buf1 = _PyUnicode_AsKind(self, kind);
9739 if (!buf1)
9740 return NULL;
9741 if (kind2 != kind)
9742 buf2 = _PyUnicode_AsKind(substring, kind);
9743 if (!buf2) {
9744 if (kind1 != kind) PyMem_Free(buf1);
9745 return NULL;
9746 }
9747 len1 = PyUnicode_GET_LENGTH(self);
9748 len2 = PyUnicode_GET_LENGTH(substring);
9749
9750 switch(kind) {
9751 case PyUnicode_1BYTE_KIND:
9752 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9753 out = asciilib_split(
9754 self, buf1, len1, buf2, len2, maxcount);
9755 else
9756 out = ucs1lib_split(
9757 self, buf1, len1, buf2, len2, maxcount);
9758 break;
9759 case PyUnicode_2BYTE_KIND:
9760 out = ucs2lib_split(
9761 self, buf1, len1, buf2, len2, maxcount);
9762 break;
9763 case PyUnicode_4BYTE_KIND:
9764 out = ucs4lib_split(
9765 self, buf1, len1, buf2, len2, maxcount);
9766 break;
9767 default:
9768 out = NULL;
9769 }
9770 if (kind1 != kind)
9771 PyMem_Free(buf1);
9772 if (kind2 != kind)
9773 PyMem_Free(buf2);
9774 return out;
9775 }
9776
9777 static PyObject *
9778 rsplit(PyObject *self,
9779 PyObject *substring,
9780 Py_ssize_t maxcount)
9781 {
9782 int kind1, kind2, kind;
9783 void *buf1, *buf2;
9784 Py_ssize_t len1, len2;
9785 PyObject* out;
9786
9787 if (maxcount < 0)
9788 maxcount = PY_SSIZE_T_MAX;
9789
9790 if (PyUnicode_READY(self) == -1)
9791 return NULL;
9792
9793 if (substring == NULL)
9794 switch(PyUnicode_KIND(self)) {
9795 case PyUnicode_1BYTE_KIND:
9796 if (PyUnicode_IS_ASCII(self))
9797 return asciilib_rsplit_whitespace(
9798 self, PyUnicode_1BYTE_DATA(self),
9799 PyUnicode_GET_LENGTH(self), maxcount
9800 );
9801 else
9802 return ucs1lib_rsplit_whitespace(
9803 self, PyUnicode_1BYTE_DATA(self),
9804 PyUnicode_GET_LENGTH(self), maxcount
9805 );
9806 case PyUnicode_2BYTE_KIND:
9807 return ucs2lib_rsplit_whitespace(
9808 self, PyUnicode_2BYTE_DATA(self),
9809 PyUnicode_GET_LENGTH(self), maxcount
9810 );
9811 case PyUnicode_4BYTE_KIND:
9812 return ucs4lib_rsplit_whitespace(
9813 self, PyUnicode_4BYTE_DATA(self),
9814 PyUnicode_GET_LENGTH(self), maxcount
9815 );
9816 default:
9817 assert(0);
9818 return NULL;
9819 }
9820
9821 if (PyUnicode_READY(substring) == -1)
9822 return NULL;
9823
9824 kind1 = PyUnicode_KIND(self);
9825 kind2 = PyUnicode_KIND(substring);
9826 kind = kind1 > kind2 ? kind1 : kind2;
9827 buf1 = PyUnicode_DATA(self);
9828 buf2 = PyUnicode_DATA(substring);
9829 if (kind1 != kind)
9830 buf1 = _PyUnicode_AsKind(self, kind);
9831 if (!buf1)
9832 return NULL;
9833 if (kind2 != kind)
9834 buf2 = _PyUnicode_AsKind(substring, kind);
9835 if (!buf2) {
9836 if (kind1 != kind) PyMem_Free(buf1);
9837 return NULL;
9838 }
9839 len1 = PyUnicode_GET_LENGTH(self);
9840 len2 = PyUnicode_GET_LENGTH(substring);
9841
9842 switch(kind) {
9843 case PyUnicode_1BYTE_KIND:
9844 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9845 out = asciilib_rsplit(
9846 self, buf1, len1, buf2, len2, maxcount);
9847 else
9848 out = ucs1lib_rsplit(
9849 self, buf1, len1, buf2, len2, maxcount);
9850 break;
9851 case PyUnicode_2BYTE_KIND:
9852 out = ucs2lib_rsplit(
9853 self, buf1, len1, buf2, len2, maxcount);
9854 break;
9855 case PyUnicode_4BYTE_KIND:
9856 out = ucs4lib_rsplit(
9857 self, buf1, len1, buf2, len2, maxcount);
9858 break;
9859 default:
9860 out = NULL;
9861 }
9862 if (kind1 != kind)
9863 PyMem_Free(buf1);
9864 if (kind2 != kind)
9865 PyMem_Free(buf2);
9866 return out;
9867 }
9868
9869 static Py_ssize_t
9870 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9871 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9872 {
9873 switch(kind) {
9874 case PyUnicode_1BYTE_KIND:
9875 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9876 return asciilib_find(buf1, len1, buf2, len2, offset);
9877 else
9878 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9879 case PyUnicode_2BYTE_KIND:
9880 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9881 case PyUnicode_4BYTE_KIND:
9882 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9883 }
9884 assert(0);
9885 return -1;
9886 }
9887
9888 static Py_ssize_t
9889 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9890 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9891 {
9892 switch(kind) {
9893 case PyUnicode_1BYTE_KIND:
9894 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9895 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9896 else
9897 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9898 case PyUnicode_2BYTE_KIND:
9899 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9900 case PyUnicode_4BYTE_KIND:
9901 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9902 }
9903 assert(0);
9904 return 0;
9905 }
9906
9907 static PyObject *
9908 replace(PyObject *self, PyObject *str1,
9909 PyObject *str2, Py_ssize_t maxcount)
9910 {
9911 PyObject *u;
9912 char *sbuf = PyUnicode_DATA(self);
9913 char *buf1 = PyUnicode_DATA(str1);
9914 char *buf2 = PyUnicode_DATA(str2);
9915 int srelease = 0, release1 = 0, release2 = 0;
9916 int skind = PyUnicode_KIND(self);
9917 int kind1 = PyUnicode_KIND(str1);
9918 int kind2 = PyUnicode_KIND(str2);
9919 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9920 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9921 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
9922 int mayshrink;
9923 Py_UCS4 maxchar, maxchar_str2;
9924
9925 if (maxcount < 0)
9926 maxcount = PY_SSIZE_T_MAX;
9927 else if (maxcount == 0 || slen == 0)
9928 goto nothing;
9929
9930 if (str1 == str2)
9931 goto nothing;
9932 if (skind < kind1)
9933 /* substring too wide to be present */
9934 goto nothing;
9935
9936 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9937 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9938 /* Replacing str1 with str2 may cause a maxchar reduction in the
9939 result string. */
9940 mayshrink = (maxchar_str2 < maxchar);
9941 maxchar = Py_MAX(maxchar, maxchar_str2);
9942
9943 if (len1 == len2) {
9944 Py_ssize_t i;
9945 /* same length */
9946 if (len1 == 0)
9947 goto nothing;
9948 if (len1 == 1) {
9949 /* replace characters */
9950 Py_UCS4 u1, u2;
9951 int rkind;
9952 u1 = PyUnicode_READ_CHAR(str1, 0);
9953 if (findchar(sbuf, PyUnicode_KIND(self),
9954 slen, u1, 1) < 0)
9955 goto nothing;
9956 u2 = PyUnicode_READ_CHAR(str2, 0);
9957 u = PyUnicode_New(slen, maxchar);
9958 if (!u)
9959 goto error;
9960 copy_characters(u, 0, self, 0, slen);
9961 rkind = PyUnicode_KIND(u);
9962 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9963 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
9964 if (--maxcount < 0)
9965 break;
9966 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
9967 }
9968 }
9969 else {
9970 int rkind = skind;
9971 char *res;
9972
9973 if (kind1 < rkind) {
9974 /* widen substring */
9975 buf1 = _PyUnicode_AsKind(str1, rkind);
9976 if (!buf1) goto error;
9977 release1 = 1;
9978 }
9979 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
9980 if (i < 0)
9981 goto nothing;
9982 if (rkind > kind2) {
9983 /* widen replacement */
9984 buf2 = _PyUnicode_AsKind(str2, rkind);
9985 if (!buf2) goto error;
9986 release2 = 1;
9987 }
9988 else if (rkind < kind2) {
9989 /* widen self and buf1 */
9990 rkind = kind2;
9991 if (release1) PyMem_Free(buf1);
9992 sbuf = _PyUnicode_AsKind(self, rkind);
9993 if (!sbuf) goto error;
9994 srelease = 1;
9995 buf1 = _PyUnicode_AsKind(str1, rkind);
9996 if (!buf1) goto error;
9997 release1 = 1;
9998 }
9999 u = PyUnicode_New(slen, maxchar);
10000 if (!u)
10001 goto error;
10002 assert(PyUnicode_KIND(u) == rkind);
10003 res = PyUnicode_DATA(u);
10004
10005 memcpy(res, sbuf, rkind * slen);
10006 /* change everything in-place, starting with this one */
10007 memcpy(res + rkind * i,
10008 buf2,
10009 rkind * len2);
10010 i += len1;
10011
10012 while ( --maxcount > 0) {
10013 i = anylib_find(rkind, self,
10014 sbuf+rkind*i, slen-i,
10015 str1, buf1, len1, i);
10016 if (i == -1)
10017 break;
10018 memcpy(res + rkind * i,
10019 buf2,
10020 rkind * len2);
10021 i += len1;
10022 }
10023 }
10024 }
10025 else {
10026 Py_ssize_t n, i, j, ires;
10027 Py_ssize_t product, new_size;
10028 int rkind = skind;
10029 char *res;
10030
10031 if (kind1 < rkind) {
10032 /* widen substring */
10033 buf1 = _PyUnicode_AsKind(str1, rkind);
10034 if (!buf1) goto error;
10035 release1 = 1;
10036 }
10037 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10038 if (n == 0)
10039 goto nothing;
10040 if (kind2 < rkind) {
10041 /* widen replacement */
10042 buf2 = _PyUnicode_AsKind(str2, rkind);
10043 if (!buf2) goto error;
10044 release2 = 1;
10045 }
10046 else if (kind2 > rkind) {
10047 /* widen self and buf1 */
10048 rkind = kind2;
10049 sbuf = _PyUnicode_AsKind(self, rkind);
10050 if (!sbuf) goto error;
10051 srelease = 1;
10052 if (release1) PyMem_Free(buf1);
10053 buf1 = _PyUnicode_AsKind(str1, rkind);
10054 if (!buf1) goto error;
10055 release1 = 1;
10056 }
10057 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10058 PyUnicode_GET_LENGTH(str1))); */
10059 product = n * (len2-len1);
10060 if ((product / (len2-len1)) != n) {
10061 PyErr_SetString(PyExc_OverflowError,
10062 "replace string is too long");
10063 goto error;
10064 }
10065 new_size = slen + product;
10066 if (new_size == 0) {
10067 Py_INCREF(unicode_empty);
10068 u = unicode_empty;
10069 goto done;
10070 }
10071 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10072 PyErr_SetString(PyExc_OverflowError,
10073 "replace string is too long");
10074 goto error;
10075 }
10076 u = PyUnicode_New(new_size, maxchar);
10077 if (!u)
10078 goto error;
10079 assert(PyUnicode_KIND(u) == rkind);
10080 res = PyUnicode_DATA(u);
10081 ires = i = 0;
10082 if (len1 > 0) {
10083 while (n-- > 0) {
10084 /* look for next match */
10085 j = anylib_find(rkind, self,
10086 sbuf + rkind * i, slen-i,
10087 str1, buf1, len1, i);
10088 if (j == -1)
10089 break;
10090 else if (j > i) {
10091 /* copy unchanged part [i:j] */
10092 memcpy(res + rkind * ires,
10093 sbuf + rkind * i,
10094 rkind * (j-i));
10095 ires += j - i;
10096 }
10097 /* copy substitution string */
10098 if (len2 > 0) {
10099 memcpy(res + rkind * ires,
10100 buf2,
10101 rkind * len2);
10102 ires += len2;
10103 }
10104 i = j + len1;
10105 }
10106 if (i < slen)
10107 /* copy tail [i:] */
10108 memcpy(res + rkind * ires,
10109 sbuf + rkind * i,
10110 rkind * (slen-i));
10111 }
10112 else {
10113 /* interleave */
10114 while (n > 0) {
10115 memcpy(res + rkind * ires,
10116 buf2,
10117 rkind * len2);
10118 ires += len2;
10119 if (--n <= 0)
10120 break;
10121 memcpy(res + rkind * ires,
10122 sbuf + rkind * i,
10123 rkind);
10124 ires++;
10125 i++;
10126 }
10127 memcpy(res + rkind * ires,
10128 sbuf + rkind * i,
10129 rkind * (slen-i));
10130 }
10131 }
10132
10133 if (mayshrink) {
10134 unicode_adjust_maxchar(&u);
10135 if (u == NULL)
10136 goto error;
10137 }
10138
10139 done:
10140 if (srelease)
10141 PyMem_FREE(sbuf);
10142 if (release1)
10143 PyMem_FREE(buf1);
10144 if (release2)
10145 PyMem_FREE(buf2);
10146 assert(_PyUnicode_CheckConsistency(u, 1));
10147 return u;
10148
10149 nothing:
10150 /* nothing to replace; return original string (when possible) */
10151 if (srelease)
10152 PyMem_FREE(sbuf);
10153 if (release1)
10154 PyMem_FREE(buf1);
10155 if (release2)
10156 PyMem_FREE(buf2);
10157 if (PyUnicode_CheckExact(self)) {
10158 Py_INCREF(self);
10159 return self;
10160 }
10161 return PyUnicode_Copy(self);
10162 error:
10163 if (srelease && sbuf)
10164 PyMem_FREE(sbuf);
10165 if (release1 && buf1)
10166 PyMem_FREE(buf1);
10167 if (release2 && buf2)
10168 PyMem_FREE(buf2);
10169 return NULL;
10170 }
10171
10172 /* --- Unicode Object Methods --------------------------------------------- */
10173
10174 PyDoc_STRVAR(title__doc__,
10175 "S.title() -> str\n\
10176 \n\
10177 Return a titlecased version of S, i.e. words start with title case\n\
10178 characters, all remaining cased characters have lower case.");
10179
10180 static PyObject*
10181 unicode_title(PyObject *self)
10182 {
10183 return fixup(self, fixtitle);
10184 }
10185
10186 PyDoc_STRVAR(capitalize__doc__,
10187 "S.capitalize() -> str\n\
10188 \n\
10189 Return a capitalized version of S, i.e. make the first character\n\
10190 have upper case and the rest lower case.");
10191
10192 static PyObject*
10193 unicode_capitalize(PyObject *self)
10194 {
10195 return fixup(self, fixcapitalize);
10196 }
10197
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every word with
   fixup()/fixcapitalize and join the words with the default separator.
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup(PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Release the old entry before PyList_SET_ITEM overwrites the
           slot. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10235
10236 /* Argument converter. Coerces to a single unicode character */
10237
10238 static int
10239 convert_uc(PyObject *obj, void *addr)
10240 {
10241 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10242 PyObject *uniobj;
10243
10244 uniobj = PyUnicode_FromObject(obj);
10245 if (uniobj == NULL) {
10246 PyErr_SetString(PyExc_TypeError,
10247 "The fill character cannot be converted to Unicode");
10248 return 0;
10249 }
10250 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10251 PyErr_SetString(PyExc_TypeError,
10252 "The fill character must be exactly one character long");
10253 Py_DECREF(uniobj);
10254 return 0;
10255 }
10256 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10257 Py_DECREF(uniobj);
10258 return 1;
10259 }
10260
10261 PyDoc_STRVAR(center__doc__,
10262 "S.center(width[, fillchar]) -> str\n\
10263 \n\
10264 Return S centered in a string of length width. Padding is\n\
10265 done using the specified fill character (default is a space)");
10266
10267 static PyObject *
10268 unicode_center(PyObject *self, PyObject *args)
10269 {
10270 Py_ssize_t marg, left;
10271 Py_ssize_t width;
10272 Py_UCS4 fillchar = ' ';
10273
10274 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10275 return NULL;
10276
10277 if (PyUnicode_READY(self) == -1)
10278 return NULL;
10279
10280 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
10281 Py_INCREF(self);
10282 return self;
10283 }
10284
10285 marg = width - _PyUnicode_LENGTH(self);
10286 left = marg / 2 + (marg & width & 1);
10287
10288 return pad(self, left, marg - left, fillchar);
10289 }
10290
10291 /* This function assumes that str1 and str2 are readied by the caller. */
10292
10293 static int
10294 unicode_compare(PyObject *str1, PyObject *str2)
10295 {
10296 int kind1, kind2;
10297 void *data1, *data2;
10298 Py_ssize_t len1, len2, i;
10299
10300 kind1 = PyUnicode_KIND(str1);
10301 kind2 = PyUnicode_KIND(str2);
10302 data1 = PyUnicode_DATA(str1);
10303 data2 = PyUnicode_DATA(str2);
10304 len1 = PyUnicode_GET_LENGTH(str1);
10305 len2 = PyUnicode_GET_LENGTH(str2);
10306
10307 for (i = 0; i < len1 && i < len2; ++i) {
10308 Py_UCS4 c1, c2;
10309 c1 = PyUnicode_READ(kind1, data1, i);
10310 c2 = PyUnicode_READ(kind2, data2, i);
10311
10312 if (c1 != c2)
10313 return (c1 < c2) ? -1 : 1;
10314 }
10315
10316 return (len1 < len2) ? -1 : (len1 != len2);
10317 }
10318
10319 int
10320 PyUnicode_Compare(PyObject *left, PyObject *right)
10321 {
10322 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10323 if (PyUnicode_READY(left) == -1 ||
10324 PyUnicode_READY(right) == -1)
10325 return -1;
10326 return unicode_compare(left, right);
10327 }
10328 PyErr_Format(PyExc_TypeError,
10329 "Can't compare %.100s and %.100s",
10330 left->ob_type->tp_name,
10331 right->ob_type->tp_name);
10332 return -1;
10333 }
10334
10335 int
10336 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10337 {
10338 Py_ssize_t i;
10339 int kind;
10340 void *data;
10341 Py_UCS4 chr;
10342
10343 assert(_PyUnicode_CHECK(uni));
10344 if (PyUnicode_READY(uni) == -1)
10345 return -1;
10346 kind = PyUnicode_KIND(uni);
10347 data = PyUnicode_DATA(uni);
10348 /* Compare Unicode string and source character set string */
10349 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10350 if (chr != str[i])
10351 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10352 /* This check keeps Python strings that end in '\0' from comparing equal
10353 to C strings identical up to that point. */
10354 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10355 return 1; /* uni is longer */
10356 if (str[i])
10357 return -1; /* str is longer */
10358 return 0;
10359 }
10360
10361
10362 #define TEST_COND(cond) \
10363 ((cond) ? Py_True : Py_False)
10364
10365 PyObject *
10366 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10367 {
10368 int result;
10369
10370 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10371 PyObject *v;
10372 if (PyUnicode_READY(left) == -1 ||
10373 PyUnicode_READY(right) == -1)
10374 return NULL;
10375 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10376 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
10377 if (op == Py_EQ) {
10378 Py_INCREF(Py_False);
10379 return Py_False;
10380 }
10381 if (op == Py_NE) {
10382 Py_INCREF(Py_True);
10383 return Py_True;
10384 }
10385 }
10386 if (left == right)
10387 result = 0;
10388 else
10389 result = unicode_compare(left, right);
10390
10391 /* Convert the return value to a Boolean */
10392 switch (op) {
10393 case Py_EQ:
10394 v = TEST_COND(result == 0);
10395 break;
10396 case Py_NE:
10397 v = TEST_COND(result != 0);
10398 break;
10399 case Py_LE:
10400 v = TEST_COND(result <= 0);
10401 break;
10402 case Py_GE:
10403 v = TEST_COND(result >= 0);
10404 break;
10405 case Py_LT:
10406 v = TEST_COND(result == -1);
10407 break;
10408 case Py_GT:
10409 v = TEST_COND(result == 1);
10410 break;
10411 default:
10412 PyErr_BadArgument();
10413 return NULL;
10414 }
10415 Py_INCREF(v);
10416 return v;
10417 }
10418
10419 Py_RETURN_NOTIMPLEMENTED;
10420 }
10421
10422 int
10423 PyUnicode_Contains(PyObject *container, PyObject *element)
10424 {
10425 PyObject *str, *sub;
10426 int kind1, kind2, kind;
10427 void *buf1, *buf2;
10428 Py_ssize_t len1, len2;
10429 int result;
10430
10431 /* Coerce the two arguments */
10432 sub = PyUnicode_FromObject(element);
10433 if (!sub) {
10434 PyErr_Format(PyExc_TypeError,
10435 "'in <string>' requires string as left operand, not %s",
10436 element->ob_type->tp_name);
10437 return -1;
10438 }
10439 if (PyUnicode_READY(sub) == -1)
10440 return -1;
10441
10442 str = PyUnicode_FromObject(container);
10443 if (!str || PyUnicode_READY(str) == -1) {
10444 Py_DECREF(sub);
10445 return -1;
10446 }
10447
10448 kind1 = PyUnicode_KIND(str);
10449 kind2 = PyUnicode_KIND(sub);
10450 kind = kind1 > kind2 ? kind1 : kind2;
10451 buf1 = PyUnicode_DATA(str);
10452 buf2 = PyUnicode_DATA(sub);
10453 if (kind1 != kind)
10454 buf1 = _PyUnicode_AsKind(str, kind);
10455 if (!buf1) {
10456 Py_DECREF(sub);
10457 return -1;
10458 }
10459 if (kind2 != kind)
10460 buf2 = _PyUnicode_AsKind(sub, kind);
10461 if (!buf2) {
10462 Py_DECREF(sub);
10463 if (kind1 != kind) PyMem_Free(buf1);
10464 return -1;
10465 }
10466 len1 = PyUnicode_GET_LENGTH(str);
10467 len2 = PyUnicode_GET_LENGTH(sub);
10468
10469 switch(kind) {
10470 case PyUnicode_1BYTE_KIND:
10471 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10472 break;
10473 case PyUnicode_2BYTE_KIND:
10474 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10475 break;
10476 case PyUnicode_4BYTE_KIND:
10477 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10478 break;
10479 default:
10480 result = -1;
10481 assert(0);
10482 }
10483
10484 Py_DECREF(str);
10485 Py_DECREF(sub);
10486
10487 if (kind1 != kind)
10488 PyMem_Free(buf1);
10489 if (kind2 != kind)
10490 PyMem_Free(buf2);
10491
10492 return result;
10493 }
10494
10495 /* Concat to string or Unicode object giving a new Unicode object. */
10496
10497 PyObject *
10498 PyUnicode_Concat(PyObject *left, PyObject *right)
10499 {
10500 PyObject *u = NULL, *v = NULL, *w;
10501 Py_UCS4 maxchar, maxchar2;
10502
10503 /* Coerce the two arguments */
10504 u = PyUnicode_FromObject(left);
10505 if (u == NULL)
10506 goto onError;
10507 v = PyUnicode_FromObject(right);
10508 if (v == NULL)
10509 goto onError;
10510
10511 /* Shortcuts */
10512 if (v == unicode_empty) {
10513 Py_DECREF(v);
10514 return u;
10515 }
10516 if (u == unicode_empty) {
10517 Py_DECREF(u);
10518 return v;
10519 }
10520
10521 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10522 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10523 maxchar = Py_MAX(maxchar, maxchar2);
10524
10525 /* Concat the two Unicode strings */
10526 w = PyUnicode_New(
10527 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10528 maxchar);
10529 if (w == NULL)
10530 goto onError;
10531 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10532 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
10533 Py_DECREF(u);
10534 Py_DECREF(v);
10535 assert(_PyUnicode_CheckConsistency(w, 1));
10536 return w;
10537
10538 onError:
10539 Py_XDECREF(u);
10540 Py_XDECREF(v);
10541 return NULL;
10542 }
10543
10544 static void
10545 unicode_append_inplace(PyObject **p_left, PyObject *right)
10546 {
10547 Py_ssize_t left_len, right_len, new_len;
10548
10549 assert(PyUnicode_IS_READY(*p_left));
10550 assert(PyUnicode_IS_READY(right));
10551
10552 left_len = PyUnicode_GET_LENGTH(*p_left);
10553 right_len = PyUnicode_GET_LENGTH(right);
10554 if (left_len > PY_SSIZE_T_MAX - right_len) {
10555 PyErr_SetString(PyExc_OverflowError,
10556 "strings are too large to concat");
10557 goto error;
10558 }
10559 new_len = left_len + right_len;
10560
10561 /* Now we own the last reference to 'left', so we can resize it
10562 * in-place.
10563 */
10564 if (unicode_resize(p_left, new_len) != 0) {
10565 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10566 * deallocated so it cannot be put back into
10567 * 'variable'. The MemoryError is raised when there
10568 * is no value in 'variable', which might (very
10569 * remotely) be a cause of incompatibilities.
10570 */
10571 goto error;
10572 }
10573 /* copy 'right' into the newly allocated area of 'left' */
10574 copy_characters(*p_left, left_len, right, 0, right_len);
10575 _PyUnicode_DIRTY(*p_left);
10576 return;
10577
10578 error:
10579 Py_DECREF(*p_left);
10580 *p_left = NULL;
10581 }
10582
10583 void
10584 PyUnicode_Append(PyObject **p_left, PyObject *right)
10585 {
10586 PyObject *left, *res;
10587
10588 if (p_left == NULL) {
10589 if (!PyErr_Occurred())
10590 PyErr_BadInternalCall();
10591 return;
10592 }
10593 left = *p_left;
10594 if (right == NULL || !PyUnicode_Check(left)) {
10595 if (!PyErr_Occurred())
10596 PyErr_BadInternalCall();
10597 goto error;
10598 }
10599
10600 if (PyUnicode_READY(left))
10601 goto error;
10602 if (PyUnicode_READY(right))
10603 goto error;
10604
10605 if (PyUnicode_CheckExact(left) && left != unicode_empty
10606 && PyUnicode_CheckExact(right) && right != unicode_empty
10607 && unicode_resizable(left)
10608 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10609 || _PyUnicode_WSTR(left) != NULL))
10610 {
10611 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10612 to change the structure size, but characters are stored just after
10613 the structure, and so it requires to move all characters which is
10614 not so different than duplicating the string. */
10615 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10616 {
10617 unicode_append_inplace(p_left, right);
10618 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
10619 return;
10620 }
10621 }
10622
10623 res = PyUnicode_Concat(left, right);
10624 if (res == NULL)
10625 goto error;
10626 Py_DECREF(left);
10627 *p_left = res;
10628 return;
10629
10630 error:
10631 Py_DECREF(*p_left);
10632 *p_left = NULL;
10633 }
10634
10635 void
10636 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10637 {
10638 PyUnicode_Append(pleft, right);
10639 Py_XDECREF(right);
10640 }
10641
10642 PyDoc_STRVAR(count__doc__,
10643 "S.count(sub[, start[, end]]) -> int\n\
10644 \n\
10645 Return the number of non-overlapping occurrences of substring sub in\n\
10646 string S[start:end]. Optional arguments start and end are\n\
10647 interpreted as in slice notation.");
10648
10649 static PyObject *
10650 unicode_count(PyObject *self, PyObject *args)
10651 {
10652 PyObject *substring;
10653 Py_ssize_t start = 0;
10654 Py_ssize_t end = PY_SSIZE_T_MAX;
10655 PyObject *result;
10656 int kind1, kind2, kind;
10657 void *buf1, *buf2;
10658 Py_ssize_t len1, len2, iresult;
10659
10660 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10661 &start, &end))
10662 return NULL;
10663
10664 kind1 = PyUnicode_KIND(self);
10665 kind2 = PyUnicode_KIND(substring);
10666 kind = kind1 > kind2 ? kind1 : kind2;
10667 buf1 = PyUnicode_DATA(self);
10668 buf2 = PyUnicode_DATA(substring);
10669 if (kind1 != kind)
10670 buf1 = _PyUnicode_AsKind(self, kind);
10671 if (!buf1) {
10672 Py_DECREF(substring);
10673 return NULL;
10674 }
10675 if (kind2 != kind)
10676 buf2 = _PyUnicode_AsKind(substring, kind);
10677 if (!buf2) {
10678 Py_DECREF(substring);
10679 if (kind1 != kind) PyMem_Free(buf1);
10680 return NULL;
10681 }
10682 len1 = PyUnicode_GET_LENGTH(self);
10683 len2 = PyUnicode_GET_LENGTH(substring);
10684
10685 ADJUST_INDICES(start, end, len1);
10686 switch(kind) {
10687 case PyUnicode_1BYTE_KIND:
10688 iresult = ucs1lib_count(
10689 ((Py_UCS1*)buf1) + start, end - start,
10690 buf2, len2, PY_SSIZE_T_MAX
10691 );
10692 break;
10693 case PyUnicode_2BYTE_KIND:
10694 iresult = ucs2lib_count(
10695 ((Py_UCS2*)buf1) + start, end - start,
10696 buf2, len2, PY_SSIZE_T_MAX
10697 );
10698 break;
10699 case PyUnicode_4BYTE_KIND:
10700 iresult = ucs4lib_count(
10701 ((Py_UCS4*)buf1) + start, end - start,
10702 buf2, len2, PY_SSIZE_T_MAX
10703 );
10704 break;
10705 default:
10706 assert(0); iresult = 0;
10707 }
10708
10709 result = PyLong_FromSsize_t(iresult);
10710
10711 if (kind1 != kind)
10712 PyMem_Free(buf1);
10713 if (kind2 != kind)
10714 PyMem_Free(buf2);
10715
10716 Py_DECREF(substring);
10717
10718 return result;
10719 }
10720
10721 PyDoc_STRVAR(encode__doc__,
10722 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10723 \n\
10724 Encode S using the codec registered for encoding. Default encoding\n\
10725 is 'utf-8'. errors may be given to set a different error\n\
10726 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10727 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10728 'xmlcharrefreplace' as well as any other name registered with\n\
10729 codecs.register_error that can handle UnicodeEncodeErrors.");
10730
10731 static PyObject *
10732 unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
10733 {
10734 static char *kwlist[] = {"encoding", "errors", 0};
10735 char *encoding = NULL;
10736 char *errors = NULL;
10737
10738 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10739 kwlist, &encoding, &errors))
10740 return NULL;
10741 return PyUnicode_AsEncodedString(self, encoding, errors);
10742 }
10743
10744 PyDoc_STRVAR(expandtabs__doc__,
10745 "S.expandtabs([tabsize]) -> str\n\
10746 \n\
10747 Return a copy of S where all tab characters are expanded using spaces.\n\
10748 If tabsize is not given, a tab size of 8 characters is assumed.");
10749
10750 static PyObject*
10751 unicode_expandtabs(PyObject *self, PyObject *args)
10752 {
10753 Py_ssize_t i, j, line_pos, src_len, incr;
10754 Py_UCS4 ch;
10755 PyObject *u;
10756 void *src_data, *dest_data;
10757 int tabsize = 8;
10758 int kind;
10759 int found;
10760
10761 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10762 return NULL;
10763
10764 if (PyUnicode_READY(self) == -1)
10765 return NULL;
10766
10767 /* First pass: determine size of output string */
10768 src_len = PyUnicode_GET_LENGTH(self);
10769 i = j = line_pos = 0;
10770 kind = PyUnicode_KIND(self);
10771 src_data = PyUnicode_DATA(self);
10772 found = 0;
10773 for (; i < src_len; i++) {
10774 ch = PyUnicode_READ(kind, src_data, i);
10775 if (ch == '\t') {
10776 found = 1;
10777 if (tabsize > 0) {
10778 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10779 if (j > PY_SSIZE_T_MAX - incr)
10780 goto overflow;
10781 line_pos += incr;
10782 j += incr;
10783 }
10784 }
10785 else {
10786 if (j > PY_SSIZE_T_MAX - 1)
10787 goto overflow;
10788 line_pos++;
10789 j++;
10790 if (ch == '\n' || ch == '\r')
10791 line_pos = 0;
10792 }
10793 }
10794 if (!found && PyUnicode_CheckExact(self)) {
10795 Py_INCREF(self);
10796 return self;
10797 }
10798
10799 /* Second pass: create output string and fill it */
10800 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10801 if (!u)
10802 return NULL;
10803 dest_data = PyUnicode_DATA(u);
10804
10805 i = j = line_pos = 0;
10806
10807 for (; i < src_len; i++) {
10808 ch = PyUnicode_READ(kind, src_data, i);
10809 if (ch == '\t') {
10810 if (tabsize > 0) {
10811 incr = tabsize - (line_pos % tabsize);
10812 line_pos += incr;
10813 while (incr--) {
10814 PyUnicode_WRITE(kind, dest_data, j, ' ');
10815 j++;
10816 }
10817 }
10818 }
10819 else {
10820 line_pos++;
10821 PyUnicode_WRITE(kind, dest_data, j, ch);
10822 j++;
10823 if (ch == '\n' || ch == '\r')
10824 line_pos = 0;
10825 }
10826 }
10827 assert (j == PyUnicode_GET_LENGTH(u));
10828 return unicode_result(u);
10829
10830 overflow:
10831 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10832 return NULL;
10833 }
10834
10835 PyDoc_STRVAR(find__doc__,
10836 "S.find(sub[, start[, end]]) -> int\n\
10837 \n\
10838 Return the lowest index in S where substring sub is found,\n\
10839 such that sub is contained within S[start:end]. Optional\n\
10840 arguments start and end are interpreted as in slice notation.\n\
10841 \n\
10842 Return -1 on failure.");
10843
10844 static PyObject *
10845 unicode_find(PyObject *self, PyObject *args)
10846 {
10847 PyObject *substring;
10848 Py_ssize_t start;
10849 Py_ssize_t end;
10850 Py_ssize_t result;
10851
10852 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10853 &start, &end))
10854 return NULL;
10855
10856 if (PyUnicode_READY(self) == -1)
10857 return NULL;
10858 if (PyUnicode_READY(substring) == -1)
10859 return NULL;
10860
10861 result = any_find_slice(1, self, substring, start, end);
10862
10863 Py_DECREF(substring);
10864
10865 if (result == -2)
10866 return NULL;
10867
10868 return PyLong_FromSsize_t(result);
10869 }
10870
10871 static PyObject *
10872 unicode_getitem(PyObject *self, Py_ssize_t index)
10873 {
10874 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10875 if (ch == (Py_UCS4)-1)
10876 return NULL;
10877 return PyUnicode_FromOrdinal(ch);
10878 }
10879
10880 /* Believe it or not, this produces the same value for ASCII strings
10881 as bytes_hash(). */
10882 static Py_hash_t
10883 unicode_hash(PyObject *self)
10884 {
10885 Py_ssize_t len;
10886 Py_uhash_t x;
10887
10888 if (_PyUnicode_HASH(self) != -1)
10889 return _PyUnicode_HASH(self);
10890 if (PyUnicode_READY(self) == -1)
10891 return -1;
10892 len = PyUnicode_GET_LENGTH(self);
10893
10894 /* The hash function as a macro, gets expanded three times below. */
10895 #define HASH(P) \
10896 x = (Py_uhash_t)*P << 7; \
10897 while (--len >= 0) \
10898 x = (1000003*x) ^ (Py_uhash_t)*P++;
10899
10900 switch (PyUnicode_KIND(self)) {
10901 case PyUnicode_1BYTE_KIND: {
10902 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10903 HASH(c);
10904 break;
10905 }
10906 case PyUnicode_2BYTE_KIND: {
10907 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10908 HASH(s);
10909 break;
10910 }
10911 default: {
10912 Py_UCS4 *l;
10913 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10914 "Impossible switch case in unicode_hash");
10915 l = PyUnicode_4BYTE_DATA(self);
10916 HASH(l);
10917 break;
10918 }
10919 }
10920 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10921
10922 if (x == -1)
10923 x = -2;
10924 _PyUnicode_HASH(self) = x;
10925 return x;
10926 }
10927 #undef HASH
10928
10929 PyDoc_STRVAR(index__doc__,
10930 "S.index(sub[, start[, end]]) -> int\n\
10931 \n\
10932 Like S.find() but raise ValueError when the substring is not found.");
10933
10934 static PyObject *
10935 unicode_index(PyObject *self, PyObject *args)
10936 {
10937 Py_ssize_t result;
10938 PyObject *substring;
10939 Py_ssize_t start;
10940 Py_ssize_t end;
10941
10942 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10943 &start, &end))
10944 return NULL;
10945
10946 if (PyUnicode_READY(self) == -1)
10947 return NULL;
10948 if (PyUnicode_READY(substring) == -1)
10949 return NULL;
10950
10951 result = any_find_slice(1, self, substring, start, end);
10952
10953 Py_DECREF(substring);
10954
10955 if (result == -2)
10956 return NULL;
10957
10958 if (result < 0) {
10959 PyErr_SetString(PyExc_ValueError, "substring not found");
10960 return NULL;
10961 }
10962
10963 return PyLong_FromSsize_t(result);
10964 }
10965
10966 PyDoc_STRVAR(islower__doc__,
10967 "S.islower() -> bool\n\
10968 \n\
10969 Return True if all cased characters in S are lowercase and there is\n\
10970 at least one cased character in S, False otherwise.");
10971
10972 static PyObject*
10973 unicode_islower(PyObject *self)
10974 {
10975 Py_ssize_t i, length;
10976 int kind;
10977 void *data;
10978 int cased;
10979
10980 if (PyUnicode_READY(self) == -1)
10981 return NULL;
10982 length = PyUnicode_GET_LENGTH(self);
10983 kind = PyUnicode_KIND(self);
10984 data = PyUnicode_DATA(self);
10985
10986 /* Shortcut for single character strings */
10987 if (length == 1)
10988 return PyBool_FromLong(
10989 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
10990
10991 /* Special case for empty strings */
10992 if (length == 0)
10993 return PyBool_FromLong(0);
10994
10995 cased = 0;
10996 for (i = 0; i < length; i++) {
10997 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10998
10999 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11000 return PyBool_FromLong(0);
11001 else if (!cased && Py_UNICODE_ISLOWER(ch))
11002 cased = 1;
11003 }
11004 return PyBool_FromLong(cased);
11005 }
11006
11007 PyDoc_STRVAR(isupper__doc__,
11008 "S.isupper() -> bool\n\
11009 \n\
11010 Return True if all cased characters in S are uppercase and there is\n\
11011 at least one cased character in S, False otherwise.");
11012
11013 static PyObject*
11014 unicode_isupper(PyObject *self)
11015 {
11016 Py_ssize_t i, length;
11017 int kind;
11018 void *data;
11019 int cased;
11020
11021 if (PyUnicode_READY(self) == -1)
11022 return NULL;
11023 length = PyUnicode_GET_LENGTH(self);
11024 kind = PyUnicode_KIND(self);
11025 data = PyUnicode_DATA(self);
11026
11027 /* Shortcut for single character strings */
11028 if (length == 1)
11029 return PyBool_FromLong(
11030 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11031
11032 /* Special case for empty strings */
11033 if (length == 0)
11034 return PyBool_FromLong(0);
11035
11036 cased = 0;
11037 for (i = 0; i < length; i++) {
11038 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11039
11040 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11041 return PyBool_FromLong(0);
11042 else if (!cased && Py_UNICODE_ISUPPER(ch))
11043 cased = 1;
11044 }
11045 return PyBool_FromLong(cased);
11046 }
11047
11048 PyDoc_STRVAR(istitle__doc__,
11049 "S.istitle() -> bool\n\
11050 \n\
11051 Return True if S is a titlecased string and there is at least one\n\
11052 character in S, i.e. upper- and titlecase characters may only\n\
11053 follow uncased characters and lowercase characters only cased ones.\n\
11054 Return False otherwise.");
11055
11056 static PyObject*
11057 unicode_istitle(PyObject *self)
11058 {
11059 Py_ssize_t i, length;
11060 int kind;
11061 void *data;
11062 int cased, previous_is_cased;
11063
11064 if (PyUnicode_READY(self) == -1)
11065 return NULL;
11066 length = PyUnicode_GET_LENGTH(self);
11067 kind = PyUnicode_KIND(self);
11068 data = PyUnicode_DATA(self);
11069
11070 /* Shortcut for single character strings */
11071 if (length == 1) {
11072 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11073 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11074 (Py_UNICODE_ISUPPER(ch) != 0));
11075 }
11076
11077 /* Special case for empty strings */
11078 if (length == 0)
11079 return PyBool_FromLong(0);
11080
11081 cased = 0;
11082 previous_is_cased = 0;
11083 for (i = 0; i < length; i++) {
11084 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11085
11086 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11087 if (previous_is_cased)
11088 return PyBool_FromLong(0);
11089 previous_is_cased = 1;
11090 cased = 1;
11091 }
11092 else if (Py_UNICODE_ISLOWER(ch)) {
11093 if (!previous_is_cased)
11094 return PyBool_FromLong(0);
11095 previous_is_cased = 1;
11096 cased = 1;
11097 }
11098 else
11099 previous_is_cased = 0;
11100 }
11101 return PyBool_FromLong(cased);
11102 }
11103
11104 PyDoc_STRVAR(isspace__doc__,
11105 "S.isspace() -> bool\n\
11106 \n\
11107 Return True if all characters in S are whitespace\n\
11108 and there is at least one character in S, False otherwise.");
11109
11110 static PyObject*
11111 unicode_isspace(PyObject *self)
11112 {
11113 Py_ssize_t i, length;
11114 int kind;
11115 void *data;
11116
11117 if (PyUnicode_READY(self) == -1)
11118 return NULL;
11119 length = PyUnicode_GET_LENGTH(self);
11120 kind = PyUnicode_KIND(self);
11121 data = PyUnicode_DATA(self);
11122
11123 /* Shortcut for single character strings */
11124 if (length == 1)
11125 return PyBool_FromLong(
11126 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11127
11128 /* Special case for empty strings */
11129 if (length == 0)
11130 return PyBool_FromLong(0);
11131
11132 for (i = 0; i < length; i++) {
11133 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11134 if (!Py_UNICODE_ISSPACE(ch))
11135 return PyBool_FromLong(0);
11136 }
11137 return PyBool_FromLong(1);
11138 }
11139
11140 PyDoc_STRVAR(isalpha__doc__,
11141 "S.isalpha() -> bool\n\
11142 \n\
11143 Return True if all characters in S are alphabetic\n\
11144 and there is at least one character in S, False otherwise.");
11145
11146 static PyObject*
11147 unicode_isalpha(PyObject *self)
11148 {
11149 Py_ssize_t i, length;
11150 int kind;
11151 void *data;
11152
11153 if (PyUnicode_READY(self) == -1)
11154 return NULL;
11155 length = PyUnicode_GET_LENGTH(self);
11156 kind = PyUnicode_KIND(self);
11157 data = PyUnicode_DATA(self);
11158
11159 /* Shortcut for single character strings */
11160 if (length == 1)
11161 return PyBool_FromLong(
11162 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11163
11164 /* Special case for empty strings */
11165 if (length == 0)
11166 return PyBool_FromLong(0);
11167
11168 for (i = 0; i < length; i++) {
11169 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11170 return PyBool_FromLong(0);
11171 }
11172 return PyBool_FromLong(1);
11173 }
11174
11175 PyDoc_STRVAR(isalnum__doc__,
11176 "S.isalnum() -> bool\n\
11177 \n\
11178 Return True if all characters in S are alphanumeric\n\
11179 and there is at least one character in S, False otherwise.");
11180
11181 static PyObject*
11182 unicode_isalnum(PyObject *self)
11183 {
11184 int kind;
11185 void *data;
11186 Py_ssize_t len, i;
11187
11188 if (PyUnicode_READY(self) == -1)
11189 return NULL;
11190
11191 kind = PyUnicode_KIND(self);
11192 data = PyUnicode_DATA(self);
11193 len = PyUnicode_GET_LENGTH(self);
11194
11195 /* Shortcut for single character strings */
11196 if (len == 1) {
11197 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11198 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11199 }
11200
11201 /* Special case for empty strings */
11202 if (len == 0)
11203 return PyBool_FromLong(0);
11204
11205 for (i = 0; i < len; i++) {
11206 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11207 if (!Py_UNICODE_ISALNUM(ch))
11208 return PyBool_FromLong(0);
11209 }
11210 return PyBool_FromLong(1);
11211 }
11212
11213 PyDoc_STRVAR(isdecimal__doc__,
11214 "S.isdecimal() -> bool\n\
11215 \n\
11216 Return True if there are only decimal characters in S,\n\
11217 False otherwise.");
11218
11219 static PyObject*
11220 unicode_isdecimal(PyObject *self)
11221 {
11222 Py_ssize_t i, length;
11223 int kind;
11224 void *data;
11225
11226 if (PyUnicode_READY(self) == -1)
11227 return NULL;
11228 length = PyUnicode_GET_LENGTH(self);
11229 kind = PyUnicode_KIND(self);
11230 data = PyUnicode_DATA(self);
11231
11232 /* Shortcut for single character strings */
11233 if (length == 1)
11234 return PyBool_FromLong(
11235 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11236
11237 /* Special case for empty strings */
11238 if (length == 0)
11239 return PyBool_FromLong(0);
11240
11241 for (i = 0; i < length; i++) {
11242 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11243 return PyBool_FromLong(0);
11244 }
11245 return PyBool_FromLong(1);
11246 }
11247
11248 PyDoc_STRVAR(isdigit__doc__,
11249 "S.isdigit() -> bool\n\
11250 \n\
11251 Return True if all characters in S are digits\n\
11252 and there is at least one character in S, False otherwise.");
11253
11254 static PyObject*
11255 unicode_isdigit(PyObject *self)
11256 {
11257 Py_ssize_t i, length;
11258 int kind;
11259 void *data;
11260
11261 if (PyUnicode_READY(self) == -1)
11262 return NULL;
11263 length = PyUnicode_GET_LENGTH(self);
11264 kind = PyUnicode_KIND(self);
11265 data = PyUnicode_DATA(self);
11266
11267 /* Shortcut for single character strings */
11268 if (length == 1) {
11269 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11270 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11271 }
11272
11273 /* Special case for empty strings */
11274 if (length == 0)
11275 return PyBool_FromLong(0);
11276
11277 for (i = 0; i < length; i++) {
11278 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11279 return PyBool_FromLong(0);
11280 }
11281 return PyBool_FromLong(1);
11282 }
11283
11284 PyDoc_STRVAR(isnumeric__doc__,
11285 "S.isnumeric() -> bool\n\
11286 \n\
11287 Return True if there are only numeric characters in S,\n\
11288 False otherwise.");
11289
11290 static PyObject*
11291 unicode_isnumeric(PyObject *self)
11292 {
11293 Py_ssize_t i, length;
11294 int kind;
11295 void *data;
11296
11297 if (PyUnicode_READY(self) == -1)
11298 return NULL;
11299 length = PyUnicode_GET_LENGTH(self);
11300 kind = PyUnicode_KIND(self);
11301 data = PyUnicode_DATA(self);
11302
11303 /* Shortcut for single character strings */
11304 if (length == 1)
11305 return PyBool_FromLong(
11306 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11307
11308 /* Special case for empty strings */
11309 if (length == 0)
11310 return PyBool_FromLong(0);
11311
11312 for (i = 0; i < length; i++) {
11313 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11314 return PyBool_FromLong(0);
11315 }
11316 return PyBool_FromLong(1);
11317 }
11318
11319 int
11320 PyUnicode_IsIdentifier(PyObject *self)
11321 {
11322 int kind;
11323 void *data;
11324 Py_ssize_t i;
11325 Py_UCS4 first;
11326
11327 if (PyUnicode_READY(self) == -1) {
11328 Py_FatalError("identifier not ready");
11329 return 0;
11330 }
11331
11332 /* Special case for empty strings */
11333 if (PyUnicode_GET_LENGTH(self) == 0)
11334 return 0;
11335 kind = PyUnicode_KIND(self);
11336 data = PyUnicode_DATA(self);
11337
11338 /* PEP 3131 says that the first character must be in
11339 XID_Start and subsequent characters in XID_Continue,
11340 and for the ASCII range, the 2.x rules apply (i.e
11341 start with letters and underscore, continue with
11342 letters, digits, underscore). However, given the current
11343 definition of XID_Start and XID_Continue, it is sufficient
11344 to check just for these, except that _ must be allowed
11345 as starting an identifier. */
11346 first = PyUnicode_READ(kind, data, 0);
11347 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11348 return 0;
11349
11350 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11351 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11352 return 0;
11353 return 1;
11354 }
11355
11356 PyDoc_STRVAR(isidentifier__doc__,
11357 "S.isidentifier() -> bool\n\
11358 \n\
11359 Return True if S is a valid identifier according\n\
11360 to the language definition.");
11361
11362 static PyObject*
11363 unicode_isidentifier(PyObject *self)
11364 {
11365 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11366 }
11367
11368 PyDoc_STRVAR(isprintable__doc__,
11369 "S.isprintable() -> bool\n\
11370 \n\
11371 Return True if all characters in S are considered\n\
11372 printable in repr() or S is empty, False otherwise.");
11373
11374 static PyObject*
11375 unicode_isprintable(PyObject *self)
11376 {
11377 Py_ssize_t i, length;
11378 int kind;
11379 void *data;
11380
11381 if (PyUnicode_READY(self) == -1)
11382 return NULL;
11383 length = PyUnicode_GET_LENGTH(self);
11384 kind = PyUnicode_KIND(self);
11385 data = PyUnicode_DATA(self);
11386
11387 /* Shortcut for single character strings */
11388 if (length == 1)
11389 return PyBool_FromLong(
11390 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11391
11392 for (i = 0; i < length; i++) {
11393 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11394 Py_RETURN_FALSE;
11395 }
11396 }
11397 Py_RETURN_TRUE;
11398 }
11399
11400 PyDoc_STRVAR(join__doc__,
11401 "S.join(iterable) -> str\n\
11402 \n\
11403 Return a string which is the concatenation of the strings in the\n\
11404 iterable. The separator between elements is S.");
11405
11406 static PyObject*
11407 unicode_join(PyObject *self, PyObject *data)
11408 {
11409 return PyUnicode_Join(self, data);
11410 }
11411
11412 static Py_ssize_t
11413 unicode_length(PyObject *self)
11414 {
11415 if (PyUnicode_READY(self) == -1)
11416 return -1;
11417 return PyUnicode_GET_LENGTH(self);
11418 }
11419
11420 PyDoc_STRVAR(ljust__doc__,
11421 "S.ljust(width[, fillchar]) -> str\n\
11422 \n\
11423 Return S left-justified in a Unicode string of length width. Padding is\n\
11424 done using the specified fill character (default is a space).");
11425
11426 static PyObject *
11427 unicode_ljust(PyObject *self, PyObject *args)
11428 {
11429 Py_ssize_t width;
11430 Py_UCS4 fillchar = ' ';
11431
11432 if (PyUnicode_READY(self) == -1)
11433 return NULL;
11434
11435 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11436 return NULL;
11437
11438 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11439 Py_INCREF(self);
11440 return self;
11441 }
11442
11443 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
11444 }
11445
11446 PyDoc_STRVAR(lower__doc__,
11447 "S.lower() -> str\n\
11448 \n\
11449 Return a copy of the string S converted to lowercase.");
11450
11451 static PyObject*
11452 unicode_lower(PyObject *self)
11453 {
11454 return fixup(self, fixlower);
11455 }
11456
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a selector: its format string minus the leading "|O:". */
#define STRIPNAME(i) (stripformat[i]+3)
11465
11466 /* externally visible for str.strip(unicode) */
11467 PyObject *
11468 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11469 {
11470 void *data;
11471 int kind;
11472 Py_ssize_t i, j, len;
11473 BLOOM_MASK sepmask;
11474
11475 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11476 return NULL;
11477
11478 kind = PyUnicode_KIND(self);
11479 data = PyUnicode_DATA(self);
11480 len = PyUnicode_GET_LENGTH(self);
11481 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11482 PyUnicode_DATA(sepobj),
11483 PyUnicode_GET_LENGTH(sepobj));
11484
11485 i = 0;
11486 if (striptype != RIGHTSTRIP) {
11487 while (i < len &&
11488 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11489 i++;
11490 }
11491 }
11492
11493 j = len;
11494 if (striptype != LEFTSTRIP) {
11495 do {
11496 j--;
11497 } while (j >= i &&
11498 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11499 j++;
11500 }
11501
11502 return PyUnicode_Substring(self, i, j);
11503 }
11504
11505 PyObject*
11506 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11507 {
11508 unsigned char *data;
11509 int kind;
11510 Py_ssize_t length;
11511
11512 if (PyUnicode_READY(self) == -1)
11513 return NULL;
11514
11515 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11516
11517 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
11518 {
11519 if (PyUnicode_CheckExact(self)) {
11520 Py_INCREF(self);
11521 return self;
11522 }
11523 else
11524 return PyUnicode_Copy(self);
11525 }
11526
11527 length = end - start;
11528 if (length == 1)
11529 return unicode_getitem(self, start);
11530
11531 if (start < 0 || end < 0) {
11532 PyErr_SetString(PyExc_IndexError, "string index out of range");
11533 return NULL;
11534 }
11535
11536 if (PyUnicode_IS_ASCII(self)) {
11537 kind = PyUnicode_KIND(self);
11538 data = PyUnicode_1BYTE_DATA(self);
11539 return unicode_fromascii(data + start, length);
11540 }
11541 else {
11542 kind = PyUnicode_KIND(self);
11543 data = PyUnicode_1BYTE_DATA(self);
11544 return PyUnicode_FromKindAndData(kind,
11545 data + kind * start,
11546 length);
11547 }
11548 }
11549
11550 static PyObject *
11551 do_strip(PyObject *self, int striptype)
11552 {
11553 int kind;
11554 void *data;
11555 Py_ssize_t len, i, j;
11556
11557 if (PyUnicode_READY(self) == -1)
11558 return NULL;
11559
11560 kind = PyUnicode_KIND(self);
11561 data = PyUnicode_DATA(self);
11562 len = PyUnicode_GET_LENGTH(self);
11563
11564 i = 0;
11565 if (striptype != RIGHTSTRIP) {
11566 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11567 i++;
11568 }
11569 }
11570
11571 j = len;
11572 if (striptype != LEFTSTRIP) {
11573 do {
11574 j--;
11575 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11576 j++;
11577 }
11578
11579 return PyUnicode_Substring(self, i, j);
11580 }
11581
11582
11583 static PyObject *
11584 do_argstrip(PyObject *self, int striptype, PyObject *args)
11585 {
11586 PyObject *sep = NULL;
11587
11588 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11589 return NULL;
11590
11591 if (sep != NULL && sep != Py_None) {
11592 if (PyUnicode_Check(sep))
11593 return _PyUnicode_XStrip(self, striptype, sep);
11594 else {
11595 PyErr_Format(PyExc_TypeError,
11596 "%s arg must be None or str",
11597 STRIPNAME(striptype));
11598 return NULL;
11599 }
11600 }
11601
11602 return do_strip(self, striptype);
11603 }
11604
11605
11606 PyDoc_STRVAR(strip__doc__,
11607 "S.strip([chars]) -> str\n\
11608 \n\
11609 Return a copy of the string S with leading and trailing\n\
11610 whitespace removed.\n\
11611 If chars is given and not None, remove characters in chars instead.");
11612
11613 static PyObject *
11614 unicode_strip(PyObject *self, PyObject *args)
11615 {
11616 if (PyTuple_GET_SIZE(args) == 0)
11617 return do_strip(self, BOTHSTRIP); /* Common case */
11618 else
11619 return do_argstrip(self, BOTHSTRIP, args);
11620 }
11621
11622
11623 PyDoc_STRVAR(lstrip__doc__,
11624 "S.lstrip([chars]) -> str\n\
11625 \n\
11626 Return a copy of the string S with leading whitespace removed.\n\
11627 If chars is given and not None, remove characters in chars instead.");
11628
11629 static PyObject *
11630 unicode_lstrip(PyObject *self, PyObject *args)
11631 {
11632 if (PyTuple_GET_SIZE(args) == 0)
11633 return do_strip(self, LEFTSTRIP); /* Common case */
11634 else
11635 return do_argstrip(self, LEFTSTRIP, args);
11636 }
11637
11638
11639 PyDoc_STRVAR(rstrip__doc__,
11640 "S.rstrip([chars]) -> str\n\
11641 \n\
11642 Return a copy of the string S with trailing whitespace removed.\n\
11643 If chars is given and not None, remove characters in chars instead.");
11644
11645 static PyObject *
11646 unicode_rstrip(PyObject *self, PyObject *args)
11647 {
11648 if (PyTuple_GET_SIZE(args) == 0)
11649 return do_strip(self, RIGHTSTRIP); /* Common case */
11650 else
11651 return do_argstrip(self, RIGHTSTRIP, args);
11652 }
11653
11654
11655 static PyObject*
11656 unicode_repeat(PyObject *str, Py_ssize_t len)
11657 {
11658 PyObject *u;
11659 Py_ssize_t nchars, n;
11660
11661 if (len < 1) {
11662 Py_INCREF(unicode_empty);
11663 return unicode_empty;
11664 }
11665
11666 if (len == 1 && PyUnicode_CheckExact(str)) {
11667 /* no repeat, return original string */
11668 Py_INCREF(str);
11669 return str;
11670 }
11671
11672 if (PyUnicode_READY(str) == -1)
11673 return NULL;
11674
11675 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11676 PyErr_SetString(PyExc_OverflowError,
11677 "repeated string is too long");
11678 return NULL;
11679 }
11680 nchars = len * PyUnicode_GET_LENGTH(str);
11681
11682 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11683 if (!u)
11684 return NULL;
11685 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11686
11687 if (PyUnicode_GET_LENGTH(str) == 1) {
11688 const int kind = PyUnicode_KIND(str);
11689 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11690 void *to = PyUnicode_DATA(u);
11691 if (kind == PyUnicode_1BYTE_KIND)
11692 memset(to, (unsigned char)fill_char, len);
11693 else {
11694 for (n = 0; n < len; ++n)
11695 PyUnicode_WRITE(kind, to, n, fill_char);
11696 }
11697 }
11698 else {
11699 /* number of characters copied this far */
11700 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11701 const Py_ssize_t char_size = PyUnicode_KIND(str);
11702 char *to = (char *) PyUnicode_DATA(u);
11703 Py_MEMCPY(to, PyUnicode_DATA(str),
11704 PyUnicode_GET_LENGTH(str) * char_size);
11705 while (done < nchars) {
11706 n = (done <= nchars-done) ? done : nchars-done;
11707 Py_MEMCPY(to + (done * char_size), to, n * char_size);
11708 done += n;
11709 }
11710 }
11711
11712 assert(_PyUnicode_CheckConsistency(u, 1));
11713 return u;
11714 }
11715
11716 PyObject *
11717 PyUnicode_Replace(PyObject *obj,
11718 PyObject *subobj,
11719 PyObject *replobj,
11720 Py_ssize_t maxcount)
11721 {
11722 PyObject *self;
11723 PyObject *str1;
11724 PyObject *str2;
11725 PyObject *result;
11726
11727 self = PyUnicode_FromObject(obj);
11728 if (self == NULL || PyUnicode_READY(self) == -1)
11729 return NULL;
11730 str1 = PyUnicode_FromObject(subobj);
11731 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
11732 Py_DECREF(self);
11733 return NULL;
11734 }
11735 str2 = PyUnicode_FromObject(replobj);
11736 if (str2 == NULL || PyUnicode_READY(str2)) {
11737 Py_DECREF(self);
11738 Py_DECREF(str1);
11739 return NULL;
11740 }
11741 result = replace(self, str1, str2, maxcount);
11742 Py_DECREF(self);
11743 Py_DECREF(str1);
11744 Py_DECREF(str2);
11745 return result;
11746 }
11747
11748 PyDoc_STRVAR(replace__doc__,
11749 "S.replace(old, new[, count]) -> str\n\
11750 \n\
11751 Return a copy of S with all occurrences of substring\n\
11752 old replaced by new. If the optional argument count is\n\
11753 given, only the first count occurrences are replaced.");
11754
11755 static PyObject*
11756 unicode_replace(PyObject *self, PyObject *args)
11757 {
11758 PyObject *str1;
11759 PyObject *str2;
11760 Py_ssize_t maxcount = -1;
11761 PyObject *result;
11762
11763 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11764 return NULL;
11765 if (!PyUnicode_READY(self) == -1)
11766 return NULL;
11767 str1 = PyUnicode_FromObject(str1);
11768 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11769 return NULL;
11770 str2 = PyUnicode_FromObject(str2);
11771 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
11772 Py_DECREF(str1);
11773 return NULL;
11774 }
11775
11776 result = replace(self, str1, str2, maxcount);
11777
11778 Py_DECREF(str1);
11779 Py_DECREF(str2);
11780 return result;
11781 }
11782
11783 static PyObject *
11784 unicode_repr(PyObject *unicode)
11785 {
11786 PyObject *repr;
11787 Py_ssize_t isize;
11788 Py_ssize_t osize, squote, dquote, i, o;
11789 Py_UCS4 max, quote;
11790 int ikind, okind;
11791 void *idata, *odata;
11792
11793 if (PyUnicode_READY(unicode) == -1)
11794 return NULL;
11795
11796 isize = PyUnicode_GET_LENGTH(unicode);
11797 idata = PyUnicode_DATA(unicode);
11798
11799 /* Compute length of output, quote characters, and
11800 maximum character */
11801 osize = 2; /* quotes */
11802 max = 127;
11803 squote = dquote = 0;
11804 ikind = PyUnicode_KIND(unicode);
11805 for (i = 0; i < isize; i++) {
11806 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11807 switch (ch) {
11808 case '\'': squote++; osize++; break;
11809 case '"': dquote++; osize++; break;
11810 case '\\': case '\t': case '\r': case '\n':
11811 osize += 2; break;
11812 default:
11813 /* Fast-path ASCII */
11814 if (ch < ' ' || ch == 0x7f)
11815 osize += 4; /* \xHH */
11816 else if (ch < 0x7f)
11817 osize++;
11818 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11819 osize++;
11820 max = ch > max ? ch : max;
11821 }
11822 else if (ch < 0x100)
11823 osize += 4; /* \xHH */
11824 else if (ch < 0x10000)
11825 osize += 6; /* \uHHHH */
11826 else
11827 osize += 10; /* \uHHHHHHHH */
11828 }
11829 }
11830
11831 quote = '\'';
11832 if (squote) {
11833 if (dquote)
11834 /* Both squote and dquote present. Use squote,
11835 and escape them */
11836 osize += squote;
11837 else
11838 quote = '"';
11839 }
11840
11841 repr = PyUnicode_New(osize, max);
11842 if (repr == NULL)
11843 return NULL;
11844 okind = PyUnicode_KIND(repr);
11845 odata = PyUnicode_DATA(repr);
11846
11847 PyUnicode_WRITE(okind, odata, 0, quote);
11848 PyUnicode_WRITE(okind, odata, osize-1, quote);
11849
11850 for (i = 0, o = 1; i < isize; i++) {
11851 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11852
11853 /* Escape quotes and backslashes */
11854 if ((ch == quote) || (ch == '\\')) {
11855 PyUnicode_WRITE(okind, odata, o++, '\\');
11856 PyUnicode_WRITE(okind, odata, o++, ch);
11857 continue;
11858 }
11859
11860 /* Map special whitespace to '\t', \n', '\r' */
11861 if (ch == '\t') {
11862 PyUnicode_WRITE(okind, odata, o++, '\\');
11863 PyUnicode_WRITE(okind, odata, o++, 't');
11864 }
11865 else if (ch == '\n') {
11866 PyUnicode_WRITE(okind, odata, o++, '\\');
11867 PyUnicode_WRITE(okind, odata, o++, 'n');
11868 }
11869 else if (ch == '\r') {
11870 PyUnicode_WRITE(okind, odata, o++, '\\');
11871 PyUnicode_WRITE(okind, odata, o++, 'r');
11872 }
11873
11874 /* Map non-printable US ASCII to '\xhh' */
11875 else if (ch < ' ' || ch == 0x7F) {
11876 PyUnicode_WRITE(okind, odata, o++, '\\');
11877 PyUnicode_WRITE(okind, odata, o++, 'x');
11878 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11879 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
11880 }
11881
11882 /* Copy ASCII characters as-is */
11883 else if (ch < 0x7F) {
11884 PyUnicode_WRITE(okind, odata, o++, ch);
11885 }
11886
11887 /* Non-ASCII characters */
11888 else {
11889 /* Map Unicode whitespace and control characters
11890 (categories Z* and C* except ASCII space)
11891 */
11892 if (!Py_UNICODE_ISPRINTABLE(ch)) {
11893 /* Map 8-bit characters to '\xhh' */
11894 if (ch <= 0xff) {
11895 PyUnicode_WRITE(okind, odata, o++, '\\');
11896 PyUnicode_WRITE(okind, odata, o++, 'x');
11897 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11898 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
11899 }
11900 /* Map 21-bit characters to '\U00xxxxxx' */
11901 else if (ch >= 0x10000) {
11902 PyUnicode_WRITE(okind, odata, o++, '\\');
11903 PyUnicode_WRITE(okind, odata, o++, 'U');
11904 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11905 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11906 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11907 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11908 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11909 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11910 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11911 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
11912 }
11913 /* Map 16-bit characters to '\uxxxx' */
11914 else {
11915 PyUnicode_WRITE(okind, odata, o++, '\\');
11916 PyUnicode_WRITE(okind, odata, o++, 'u');
11917 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11918 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11919 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11920 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
11921 }
11922 }
11923 /* Copy characters as-is */
11924 else {
11925 PyUnicode_WRITE(okind, odata, o++, ch);
11926 }
11927 }
11928 }
11929 /* Closing quote already added at the beginning */
11930 assert(_PyUnicode_CheckConsistency(repr, 1));
11931 return repr;
11932 }
11933
11934 PyDoc_STRVAR(rfind__doc__,
11935 "S.rfind(sub[, start[, end]]) -> int\n\
11936 \n\
11937 Return the highest index in S where substring sub is found,\n\
11938 such that sub is contained within S[start:end]. Optional\n\
11939 arguments start and end are interpreted as in slice notation.\n\
11940 \n\
11941 Return -1 on failure.");
11942
11943 static PyObject *
11944 unicode_rfind(PyObject *self, PyObject *args)
11945 {
11946 PyObject *substring;
11947 Py_ssize_t start;
11948 Py_ssize_t end;
11949 Py_ssize_t result;
11950
11951 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11952 &start, &end))
11953 return NULL;
11954
11955 if (PyUnicode_READY(self) == -1)
11956 return NULL;
11957 if (PyUnicode_READY(substring) == -1)
11958 return NULL;
11959
11960 result = any_find_slice(-1, self, substring, start, end);
11961
11962 Py_DECREF(substring);
11963
11964 if (result == -2)
11965 return NULL;
11966
11967 return PyLong_FromSsize_t(result);
11968 }
11969
11970 PyDoc_STRVAR(rindex__doc__,
11971 "S.rindex(sub[, start[, end]]) -> int\n\
11972 \n\
11973 Like S.rfind() but raise ValueError when the substring is not found.");
11974
11975 static PyObject *
11976 unicode_rindex(PyObject *self, PyObject *args)
11977 {
11978 PyObject *substring;
11979 Py_ssize_t start;
11980 Py_ssize_t end;
11981 Py_ssize_t result;
11982
11983 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11984 &start, &end))
11985 return NULL;
11986
11987 if (PyUnicode_READY(self) == -1)
11988 return NULL;
11989 if (PyUnicode_READY(substring) == -1)
11990 return NULL;
11991
11992 result = any_find_slice(-1, self, substring, start, end);
11993
11994 Py_DECREF(substring);
11995
11996 if (result == -2)
11997 return NULL;
11998
11999 if (result < 0) {
12000 PyErr_SetString(PyExc_ValueError, "substring not found");
12001 return NULL;
12002 }
12003
12004 return PyLong_FromSsize_t(result);
12005 }
12006
12007 PyDoc_STRVAR(rjust__doc__,
12008 "S.rjust(width[, fillchar]) -> str\n\
12009 \n\
12010 Return S right-justified in a string of length width. Padding is\n\
12011 done using the specified fill character (default is a space).");
12012
12013 static PyObject *
12014 unicode_rjust(PyObject *self, PyObject *args)
12015 {
12016 Py_ssize_t width;
12017 Py_UCS4 fillchar = ' ';
12018
12019 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12020 return NULL;
12021
12022 if (PyUnicode_READY(self) == -1)
12023 return NULL;
12024
12025 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
12026 Py_INCREF(self);
12027 return self;
12028 }
12029
12030 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
12031 }
12032
12033 PyObject *
12034 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12035 {
12036 PyObject *result;
12037
12038 s = PyUnicode_FromObject(s);
12039 if (s == NULL)
12040 return NULL;
12041 if (sep != NULL) {
12042 sep = PyUnicode_FromObject(sep);
12043 if (sep == NULL) {
12044 Py_DECREF(s);
12045 return NULL;
12046 }
12047 }
12048
12049 result = split(s, sep, maxsplit);
12050
12051 Py_DECREF(s);
12052 Py_XDECREF(sep);
12053 return result;
12054 }
12055
12056 PyDoc_STRVAR(split__doc__,
12057 "S.split([sep[, maxsplit]]) -> list of strings\n\
12058 \n\
12059 Return a list of the words in S, using sep as the\n\
12060 delimiter string. If maxsplit is given, at most maxsplit\n\
12061 splits are done. If sep is not specified or is None, any\n\
12062 whitespace string is a separator and empty strings are\n\
12063 removed from the result.");
12064
12065 static PyObject*
12066 unicode_split(PyObject *self, PyObject *args)
12067 {
12068 PyObject *substring = Py_None;
12069 Py_ssize_t maxcount = -1;
12070
12071 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
12072 return NULL;
12073
12074 if (substring == Py_None)
12075 return split(self, NULL, maxcount);
12076 else if (PyUnicode_Check(substring))
12077 return split(self, substring, maxcount);
12078 else
12079 return PyUnicode_Split(self, substring, maxcount);
12080 }
12081
12082 PyObject *
12083 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12084 {
12085 PyObject* str_obj;
12086 PyObject* sep_obj;
12087 PyObject* out;
12088 int kind1, kind2, kind;
12089 void *buf1 = NULL, *buf2 = NULL;
12090 Py_ssize_t len1, len2;
12091
12092 str_obj = PyUnicode_FromObject(str_in);
12093 if (!str_obj || PyUnicode_READY(str_obj) == -1)
12094 return NULL;
12095 sep_obj = PyUnicode_FromObject(sep_in);
12096 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
12097 Py_DECREF(str_obj);
12098 return NULL;
12099 }
12100
12101 kind1 = PyUnicode_KIND(str_obj);
12102 kind2 = PyUnicode_KIND(sep_obj);
12103 kind = Py_MAX(kind1, kind2);
12104 buf1 = PyUnicode_DATA(str_obj);
12105 if (kind1 != kind)
12106 buf1 = _PyUnicode_AsKind(str_obj, kind);
12107 if (!buf1)
12108 goto onError;
12109 buf2 = PyUnicode_DATA(sep_obj);
12110 if (kind2 != kind)
12111 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12112 if (!buf2)
12113 goto onError;
12114 len1 = PyUnicode_GET_LENGTH(str_obj);
12115 len2 = PyUnicode_GET_LENGTH(sep_obj);
12116
12117 switch(PyUnicode_KIND(str_obj)) {
12118 case PyUnicode_1BYTE_KIND:
12119 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12120 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12121 else
12122 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12123 break;
12124 case PyUnicode_2BYTE_KIND:
12125 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12126 break;
12127 case PyUnicode_4BYTE_KIND:
12128 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12129 break;
12130 default:
12131 assert(0);
12132 out = 0;
12133 }
12134
12135 Py_DECREF(sep_obj);
12136 Py_DECREF(str_obj);
12137 if (kind1 != kind)
12138 PyMem_Free(buf1);
12139 if (kind2 != kind)
12140 PyMem_Free(buf2);
12141
12142 return out;
12143 onError:
12144 Py_DECREF(sep_obj);
12145 Py_DECREF(str_obj);
12146 if (kind1 != kind && buf1)
12147 PyMem_Free(buf1);
12148 if (kind2 != kind && buf2)
12149 PyMem_Free(buf2);
12150 return NULL;
12151 }
12152
12153
12154 PyObject *
12155 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12156 {
12157 PyObject* str_obj;
12158 PyObject* sep_obj;
12159 PyObject* out;
12160 int kind1, kind2, kind;
12161 void *buf1 = NULL, *buf2 = NULL;
12162 Py_ssize_t len1, len2;
12163
12164 str_obj = PyUnicode_FromObject(str_in);
12165 if (!str_obj)
12166 return NULL;
12167 sep_obj = PyUnicode_FromObject(sep_in);
12168 if (!sep_obj) {
12169 Py_DECREF(str_obj);
12170 return NULL;
12171 }
12172
12173 kind1 = PyUnicode_KIND(str_in);
12174 kind2 = PyUnicode_KIND(sep_obj);
12175 kind = Py_MAX(kind1, kind2);
12176 buf1 = PyUnicode_DATA(str_in);
12177 if (kind1 != kind)
12178 buf1 = _PyUnicode_AsKind(str_in, kind);
12179 if (!buf1)
12180 goto onError;
12181 buf2 = PyUnicode_DATA(sep_obj);
12182 if (kind2 != kind)
12183 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12184 if (!buf2)
12185 goto onError;
12186 len1 = PyUnicode_GET_LENGTH(str_obj);
12187 len2 = PyUnicode_GET_LENGTH(sep_obj);
12188
12189 switch(PyUnicode_KIND(str_in)) {
12190 case PyUnicode_1BYTE_KIND:
12191 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12192 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12193 else
12194 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12195 break;
12196 case PyUnicode_2BYTE_KIND:
12197 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12198 break;
12199 case PyUnicode_4BYTE_KIND:
12200 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12201 break;
12202 default:
12203 assert(0);
12204 out = 0;
12205 }
12206
12207 Py_DECREF(sep_obj);
12208 Py_DECREF(str_obj);
12209 if (kind1 != kind)
12210 PyMem_Free(buf1);
12211 if (kind2 != kind)
12212 PyMem_Free(buf2);
12213
12214 return out;
12215 onError:
12216 Py_DECREF(sep_obj);
12217 Py_DECREF(str_obj);
12218 if (kind1 != kind && buf1)
12219 PyMem_Free(buf1);
12220 if (kind2 != kind && buf2)
12221 PyMem_Free(buf2);
12222 return NULL;
12223 }
12224
12225 PyDoc_STRVAR(partition__doc__,
12226 "S.partition(sep) -> (head, sep, tail)\n\
12227 \n\
12228 Search for the separator sep in S, and return the part before it,\n\
12229 the separator itself, and the part after it. If the separator is not\n\
12230 found, return S and two empty strings.");
12231
12232 static PyObject*
12233 unicode_partition(PyObject *self, PyObject *separator)
12234 {
12235 return PyUnicode_Partition(self, separator);
12236 }
12237
12238 PyDoc_STRVAR(rpartition__doc__,
12239 "S.rpartition(sep) -> (head, sep, tail)\n\
12240 \n\
12241 Search for the separator sep in S, starting at the end of S, and return\n\
12242 the part before it, the separator itself, and the part after it. If the\n\
12243 separator is not found, return two empty strings and S.");
12244
12245 static PyObject*
12246 unicode_rpartition(PyObject *self, PyObject *separator)
12247 {
12248 return PyUnicode_RPartition(self, separator);
12249 }
12250
12251 PyObject *
12252 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12253 {
12254 PyObject *result;
12255
12256 s = PyUnicode_FromObject(s);
12257 if (s == NULL)
12258 return NULL;
12259 if (sep != NULL) {
12260 sep = PyUnicode_FromObject(sep);
12261 if (sep == NULL) {
12262 Py_DECREF(s);
12263 return NULL;
12264 }
12265 }
12266
12267 result = rsplit(s, sep, maxsplit);
12268
12269 Py_DECREF(s);
12270 Py_XDECREF(sep);
12271 return result;
12272 }
12273
12274 PyDoc_STRVAR(rsplit__doc__,
12275 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
12276 \n\
12277 Return a list of the words in S, using sep as the\n\
12278 delimiter string, starting at the end of the string and\n\
12279 working to the front. If maxsplit is given, at most maxsplit\n\
12280 splits are done. If sep is not specified, any whitespace string\n\
12281 is a separator.");
12282
12283 static PyObject*
12284 unicode_rsplit(PyObject *self, PyObject *args)
12285 {
12286 PyObject *substring = Py_None;
12287 Py_ssize_t maxcount = -1;
12288
12289 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
12290 return NULL;
12291
12292 if (substring == Py_None)
12293 return rsplit(self, NULL, maxcount);
12294 else if (PyUnicode_Check(substring))
12295 return rsplit(self, substring, maxcount);
12296 else
12297 return PyUnicode_RSplit(self, substring, maxcount);
12298 }
12299
12300 PyDoc_STRVAR(splitlines__doc__,
12301 "S.splitlines([keepends]) -> list of strings\n\
12302 \n\
12303 Return a list of the lines in S, breaking at line boundaries.\n\
12304 Line breaks are not included in the resulting list unless keepends\n\
12305 is given and true.");
12306
12307 static PyObject*
12308 unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12309 {
12310 static char *kwlist[] = {"keepends", 0};
12311 int keepends = 0;
12312
12313 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12314 kwlist, &keepends))
12315 return NULL;
12316
12317 return PyUnicode_Splitlines(self, keepends);
12318 }
12319
12320 static
12321 PyObject *unicode_str(PyObject *self)
12322 {
12323 if (PyUnicode_CheckExact(self)) {
12324 Py_INCREF(self);
12325 return self;
12326 } else
12327 /* Subtype -- return genuine unicode string with the same value. */
12328 return PyUnicode_Copy(self);
12329 }
12330
12331 PyDoc_STRVAR(swapcase__doc__,
12332 "S.swapcase() -> str\n\
12333 \n\
12334 Return a copy of S with uppercase characters converted to lowercase\n\
12335 and vice versa.");
12336
12337 static PyObject*
12338 unicode_swapcase(PyObject *self)
12339 {
12340 return fixup(self, fixswapcase);
12341 }
12342
12343 PyDoc_STRVAR(maketrans__doc__,
12344 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12345 \n\
12346 Return a translation table usable for str.translate().\n\
12347 If there is only one argument, it must be a dictionary mapping Unicode\n\
12348 ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12349 Character keys will be then converted to ordinals.\n\
12350 If there are two arguments, they must be strings of equal length, and\n\
12351 in the resulting dictionary, each character in x will be mapped to the\n\
12352 character at the same position in y. If there is a third argument, it\n\
12353 must be a string, whose characters will be mapped to None in the result.");
12354
12355 static PyObject*
12356 unicode_maketrans(PyObject *null, PyObject *args)
12357 {
12358 PyObject *x, *y = NULL, *z = NULL;
12359 PyObject *new = NULL, *key, *value;
12360 Py_ssize_t i = 0;
12361 int res;
12362
12363 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12364 return NULL;
12365 new = PyDict_New();
12366 if (!new)
12367 return NULL;
12368 if (y != NULL) {
12369 int x_kind, y_kind, z_kind;
12370 void *x_data, *y_data, *z_data;
12371
12372 /* x must be a string too, of equal length */
12373 if (!PyUnicode_Check(x)) {
12374 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12375 "be a string if there is a second argument");
12376 goto err;
12377 }
12378 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12379 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12380 "arguments must have equal length");
12381 goto err;
12382 }
12383 /* create entries for translating chars in x to those in y */
12384 x_kind = PyUnicode_KIND(x);
12385 y_kind = PyUnicode_KIND(y);
12386 x_data = PyUnicode_DATA(x);
12387 y_data = PyUnicode_DATA(y);
12388 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12389 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12390 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12391 if (!key || !value)
12392 goto err;
12393 res = PyDict_SetItem(new, key, value);
12394 Py_DECREF(key);
12395 Py_DECREF(value);
12396 if (res < 0)
12397 goto err;
12398 }
12399 /* create entries for deleting chars in z */
12400 if (z != NULL) {
12401 z_kind = PyUnicode_KIND(z);
12402 z_data = PyUnicode_DATA(z);
12403 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12404 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12405 if (!key)
12406 goto err;
12407 res = PyDict_SetItem(new, key, Py_None);
12408 Py_DECREF(key);
12409 if (res < 0)
12410 goto err;
12411 }
12412 }
12413 } else {
12414 int kind;
12415 void *data;
12416
12417 /* x must be a dict */
12418 if (!PyDict_CheckExact(x)) {
12419 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12420 "to maketrans it must be a dict");
12421 goto err;
12422 }
12423 /* copy entries into the new dict, converting string keys to int keys */
12424 while (PyDict_Next(x, &i, &key, &value)) {
12425 if (PyUnicode_Check(key)) {
12426 /* convert string keys to integer keys */
12427 PyObject *newkey;
12428 if (PyUnicode_GET_LENGTH(key) != 1) {
12429 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12430 "table must be of length 1");
12431 goto err;
12432 }
12433 kind = PyUnicode_KIND(key);
12434 data = PyUnicode_DATA(key);
12435 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12436 if (!newkey)
12437 goto err;
12438 res = PyDict_SetItem(new, newkey, value);
12439 Py_DECREF(newkey);
12440 if (res < 0)
12441 goto err;
12442 } else if (PyLong_Check(key)) {
12443 /* just keep integer keys */
12444 if (PyDict_SetItem(new, key, value) < 0)
12445 goto err;
12446 } else {
12447 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12448 "be strings or integers");
12449 goto err;
12450 }
12451 }
12452 }
12453 return new;
12454 err:
12455 Py_DECREF(new);
12456 return NULL;
12457 }
12458
12459 PyDoc_STRVAR(translate__doc__,
12460 "S.translate(table) -> str\n\
12461 \n\
12462 Return a copy of the string S, where all characters have been mapped\n\
12463 through the given translation table, which must be a mapping of\n\
12464 Unicode ordinals to Unicode ordinals, strings, or None.\n\
12465 Unmapped characters are left untouched. Characters mapped to None\n\
12466 are deleted.");
12467
12468 static PyObject*
12469 unicode_translate(PyObject *self, PyObject *table)
12470 {
12471 return _PyUnicode_TranslateCharmap(self, table, "ignore");
12472 }
12473
12474 PyDoc_STRVAR(upper__doc__,
12475 "S.upper() -> str\n\
12476 \n\
12477 Return a copy of S converted to uppercase.");
12478
12479 static PyObject*
12480 unicode_upper(PyObject *self)
12481 {
12482 return fixup(self, fixupper);
12483 }
12484
12485 PyDoc_STRVAR(zfill__doc__,
12486 "S.zfill(width) -> str\n\
12487 \n\
12488 Pad a numeric string S with zeros on the left, to fill a field\n\
12489 of the specified width. The string S is never truncated.");
12490
12491 static PyObject *
12492 unicode_zfill(PyObject *self, PyObject *args)
12493 {
12494 Py_ssize_t fill;
12495 PyObject *u;
12496 Py_ssize_t width;
12497 int kind;
12498 void *data;
12499 Py_UCS4 chr;
12500
12501 if (PyUnicode_READY(self) == -1)
12502 return NULL;
12503
12504 if (!PyArg_ParseTuple(args, "n:zfill", &width))
12505 return NULL;
12506
12507 if (PyUnicode_GET_LENGTH(self) >= width) {
12508 if (PyUnicode_CheckExact(self)) {
12509 Py_INCREF(self);
12510 return self;
12511 }
12512 else
12513 return PyUnicode_Copy(self);
12514 }
12515
12516 fill = width - _PyUnicode_LENGTH(self);
12517
12518 u = pad(self, fill, 0, '0');
12519
12520 if (u == NULL)
12521 return NULL;
12522
12523 kind = PyUnicode_KIND(u);
12524 data = PyUnicode_DATA(u);
12525 chr = PyUnicode_READ(kind, data, fill);
12526
12527 if (chr == '+' || chr == '-') {
12528 /* move sign to beginning of string */
12529 PyUnicode_WRITE(kind, data, 0, chr);
12530 PyUnicode_WRITE(kind, data, fill, '0');
12531 }
12532
12533 assert(_PyUnicode_CheckConsistency(u, 1));
12534 return u;
12535 }
12536
#if 0
/* Compiled out by the surrounding #if 0: forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12544
12545 PyDoc_STRVAR(startswith__doc__,
12546 "S.startswith(prefix[, start[, end]]) -> bool\n\
12547 \n\
12548 Return True if S starts with the specified prefix, False otherwise.\n\
12549 With optional start, test S beginning at that position.\n\
12550 With optional end, stop comparing S at that position.\n\
12551 prefix can also be a tuple of strings to try.");
12552
12553 static PyObject *
12554 unicode_startswith(PyObject *self,
12555 PyObject *args)
12556 {
12557 PyObject *subobj;
12558 PyObject *substring;
12559 Py_ssize_t start = 0;
12560 Py_ssize_t end = PY_SSIZE_T_MAX;
12561 int result;
12562
12563 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12564 return NULL;
12565 if (PyTuple_Check(subobj)) {
12566 Py_ssize_t i;
12567 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12568 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
12569 if (substring == NULL)
12570 return NULL;
12571 result = tailmatch(self, substring, start, end, -1);
12572 Py_DECREF(substring);
12573 if (result) {
12574 Py_RETURN_TRUE;
12575 }
12576 }
12577 /* nothing matched */
12578 Py_RETURN_FALSE;
12579 }
12580 substring = PyUnicode_FromObject(subobj);
12581 if (substring == NULL) {
12582 if (PyErr_ExceptionMatches(PyExc_TypeError))
12583 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12584 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12585 return NULL;
12586 }
12587 result = tailmatch(self, substring, start, end, -1);
12588 Py_DECREF(substring);
12589 return PyBool_FromLong(result);
12590 }
12591
12592
12593 PyDoc_STRVAR(endswith__doc__,
12594 "S.endswith(suffix[, start[, end]]) -> bool\n\
12595 \n\
12596 Return True if S ends with the specified suffix, False otherwise.\n\
12597 With optional start, test S beginning at that position.\n\
12598 With optional end, stop comparing S at that position.\n\
12599 suffix can also be a tuple of strings to try.");
12600
12601 static PyObject *
12602 unicode_endswith(PyObject *self,
12603 PyObject *args)
12604 {
12605 PyObject *subobj;
12606 PyObject *substring;
12607 Py_ssize_t start = 0;
12608 Py_ssize_t end = PY_SSIZE_T_MAX;
12609 int result;
12610
12611 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12612 return NULL;
12613 if (PyTuple_Check(subobj)) {
12614 Py_ssize_t i;
12615 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12616 substring = PyUnicode_FromObject(
12617 PyTuple_GET_ITEM(subobj, i));
12618 if (substring == NULL)
12619 return NULL;
12620 result = tailmatch(self, substring, start, end, +1);
12621 Py_DECREF(substring);
12622 if (result) {
12623 Py_RETURN_TRUE;
12624 }
12625 }
12626 Py_RETURN_FALSE;
12627 }
12628 substring = PyUnicode_FromObject(subobj);
12629 if (substring == NULL) {
12630 if (PyErr_ExceptionMatches(PyExc_TypeError))
12631 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12632 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12633 return NULL;
12634 }
12635 result = tailmatch(self, substring, start, end, +1);
12636 Py_DECREF(substring);
12637 return PyBool_FromLong(result);
12638 }
12639
12640 #include "stringlib/unicode_format.h"
12641
12642 PyDoc_STRVAR(format__doc__,
12643 "S.format(*args, **kwargs) -> str\n\
12644 \n\
12645 Return a formatted version of S, using substitutions from args and kwargs.\n\
12646 The substitutions are identified by braces ('{' and '}').");
12647
12648 PyDoc_STRVAR(format_map__doc__,
12649 "S.format_map(mapping) -> str\n\
12650 \n\
12651 Return a formatted version of S, using substitutions from mapping.\n\
12652 The substitutions are identified by braces ('{' and '}').");
12653
12654 static PyObject *
12655 unicode__format__(PyObject* self, PyObject* args)
12656 {
12657 PyObject *format_spec, *out;
12658
12659 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12660 return NULL;
12661
12662 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
12663 PyUnicode_GET_LENGTH(format_spec));
12664 return out;
12665 }
12666
12667 PyDoc_STRVAR(p_format__doc__,
12668 "S.__format__(format_spec) -> str\n\
12669 \n\
12670 Return a formatted version of S as described by format_spec.");
12671
12672 static PyObject *
12673 unicode__sizeof__(PyObject *v)
12674 {
12675 Py_ssize_t size;
12676
12677 /* If it's a compact object, account for base structure +
12678 character data. */
12679 if (PyUnicode_IS_COMPACT_ASCII(v))
12680 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12681 else if (PyUnicode_IS_COMPACT(v))
12682 size = sizeof(PyCompactUnicodeObject) +
12683 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
12684 else {
12685 /* If it is a two-block object, account for base object, and
12686 for character block if present. */
12687 size = sizeof(PyUnicodeObject);
12688 if (_PyUnicode_DATA_ANY(v))
12689 size += (PyUnicode_GET_LENGTH(v) + 1) *
12690 PyUnicode_KIND(v);
12691 }
12692 /* If the wstr pointer is present, account for it unless it is shared
12693 with the data pointer. Check if the data is not shared. */
12694 if (_PyUnicode_HAS_WSTR_MEMORY(v))
12695 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12696 if (_PyUnicode_HAS_UTF8_MEMORY(v))
12697 size += PyUnicode_UTF8_LENGTH(v) + 1;
12698
12699 return PyLong_FromSsize_t(size);
12700 }
12701
12702 PyDoc_STRVAR(sizeof__doc__,
12703 "S.__sizeof__() -> size of S in memory, in bytes");
12704
12705 static PyObject *
12706 unicode_getnewargs(PyObject *v)
12707 {
12708 PyObject *copy = PyUnicode_Copy(v);
12709 if (!copy)
12710 return NULL;
12711 return Py_BuildValue("(N)", copy);
12712 }
12713
/* Method table for the str type; it is installed through the tp_methods
   slot of PyUnicode_Type.  Each row gives the Python-visible name, the C
   implementation, the calling-convention flags and the docstring.  The
   table is terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only static method in the table. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
12777
12778 static PyObject *
12779 unicode_mod(PyObject *v, PyObject *w)
12780 {
12781 if (!PyUnicode_Check(v))
12782 Py_RETURN_NOTIMPLEMENTED;
12783 return PyUnicode_Format(v, w);
12784 }
12785
/* Number protocol table for str.  Only nb_remainder (the % operator) is
   filled in; the slots after it are left out of the initializer and are
   therefore zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
12792
/* Sequence protocol table for str: length, concatenation, repetition,
   item access and containment.  The slice and item-assignment slots are
   left empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
12803
12804 static PyObject*
12805 unicode_subscript(PyObject* self, PyObject* item)
12806 {
12807 if (PyUnicode_READY(self) == -1)
12808 return NULL;
12809
12810 if (PyIndex_Check(item)) {
12811 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
12812 if (i == -1 && PyErr_Occurred())
12813 return NULL;
12814 if (i < 0)
12815 i += PyUnicode_GET_LENGTH(self);
12816 return unicode_getitem(self, i);
12817 } else if (PySlice_Check(item)) {
12818 Py_ssize_t start, stop, step, slicelength, cur, i;
12819 PyObject *result;
12820 void *src_data, *dest_data;
12821 int src_kind, dest_kind;
12822 Py_UCS4 ch, max_char, kind_limit;
12823
12824 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
12825 &start, &stop, &step, &slicelength) < 0) {
12826 return NULL;
12827 }
12828
12829 if (slicelength <= 0) {
12830 return PyUnicode_New(0, 0);
12831 } else if (start == 0 && step == 1 &&
12832 slicelength == PyUnicode_GET_LENGTH(self) &&
12833 PyUnicode_CheckExact(self)) {
12834 Py_INCREF(self);
12835 return self;
12836 } else if (step == 1) {
12837 return PyUnicode_Substring(self,
12838 start, start + slicelength);
12839 }
12840 /* General case */
12841 src_kind = PyUnicode_KIND(self);
12842 src_data = PyUnicode_DATA(self);
12843 if (!PyUnicode_IS_ASCII(self)) {
12844 kind_limit = kind_maxchar_limit(src_kind);
12845 max_char = 0;
12846 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12847 ch = PyUnicode_READ(src_kind, src_data, cur);
12848 if (ch > max_char) {
12849 max_char = ch;
12850 if (max_char >= kind_limit)
12851 break;
12852 }
12853 }
12854 }
12855 else
12856 max_char = 127;
12857 result = PyUnicode_New(slicelength, max_char);
12858 if (result == NULL)
12859 return NULL;
12860 dest_kind = PyUnicode_KIND(result);
12861 dest_data = PyUnicode_DATA(result);
12862
12863 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12864 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12865 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
12866 }
12867 assert(_PyUnicode_CheckConsistency(result, 1));
12868 return result;
12869 } else {
12870 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12871 return NULL;
12872 }
12873 }
12874
/* Mapping protocol table for str: length and subscripting only; the
   assignment slot is a null function pointer. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
12880
12881
12882 /* Helpers for PyUnicode_Format() */
12883
12884 static PyObject *
12885 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
12886 {
12887 Py_ssize_t argidx = *p_argidx;
12888 if (argidx < arglen) {
12889 (*p_argidx)++;
12890 if (arglen < 0)
12891 return args;
12892 else
12893 return PyTuple_GetItem(args, argidx);
12894 }
12895 PyErr_SetString(PyExc_TypeError,
12896 "not enough arguments for format string");
12897 return NULL;
12898 }
12899
12900 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
12901
12902 static PyObject *
12903 formatfloat(PyObject *v, int flags, int prec, int type)
12904 {
12905 char *p;
12906 PyObject *result;
12907 double x;
12908
12909 x = PyFloat_AsDouble(v);
12910 if (x == -1.0 && PyErr_Occurred())
12911 return NULL;
12912
12913 if (prec < 0)
12914 prec = 6;
12915
12916 p = PyOS_double_to_string(x, type, prec,
12917 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
12918 if (p == NULL)
12919 return NULL;
12920 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
12921 PyMem_Free(p);
12922 return result;
12923 }
12924
12925 static PyObject*
12926 formatlong(PyObject *val, int flags, int prec, int type)
12927 {
12928 char *buf;
12929 int len;
12930 PyObject *str; /* temporary string object. */
12931 PyObject *result;
12932
12933 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12934 if (!str)
12935 return NULL;
12936 result = PyUnicode_DecodeASCII(buf, len, NULL);
12937 Py_DECREF(str);
12938 return result;
12939 }
12940
12941 static Py_UCS4
12942 formatchar(PyObject *v)
12943 {
12944 /* presume that the buffer is at least 3 characters long */
12945 if (PyUnicode_Check(v)) {
12946 if (PyUnicode_GET_LENGTH(v) == 1) {
12947 return PyUnicode_READ_CHAR(v, 0);
12948 }
12949 goto onError;
12950 }
12951 else {
12952 /* Integer input truncated to a character */
12953 long x;
12954 x = PyLong_AsLong(v);
12955 if (x == -1 && PyErr_Occurred())
12956 goto onError;
12957
12958 if (x < 0 || x > MAX_UNICODE) {
12959 PyErr_SetString(PyExc_OverflowError,
12960 "%c arg not in range(0x110000)");
12961 return (Py_UCS4) -1;
12962 }
12963
12964 return (Py_UCS4) x;
12965 }
12966
12967 onError:
12968 PyErr_SetString(PyExc_TypeError,
12969 "%c requires int or char");
12970 return (Py_UCS4) -1;
12971 }
12972
12973 static int
12974 repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
12975 {
12976 int r;
12977 assert(count > 0);
12978 assert(PyUnicode_Check(obj));
12979 if (count > 5) {
12980 PyObject *repeated = unicode_repeat(obj, count);
12981 if (repeated == NULL)
12982 return -1;
12983 r = _PyAccu_Accumulate(acc, repeated);
12984 Py_DECREF(repeated);
12985 return r;
12986 }
12987 else {
12988 do {
12989 if (_PyAccu_Accumulate(acc, obj))
12990 return -1;
12991 } while (--count);
12992 return 0;
12993 }
12994 }
12995
12996 PyObject *
12997 PyUnicode_Format(PyObject *format, PyObject *args)
12998 {
12999 void *fmt;
13000 int fmtkind;
13001 PyObject *result;
13002 int kind;
13003 int r;
13004 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
13005 int args_owned = 0;
13006 PyObject *dict = NULL;
13007 PyObject *temp = NULL;
13008 PyObject *second = NULL;
13009 PyObject *uformat;
13010 _PyAccu acc;
13011 static PyObject *plus, *minus, *blank, *zero, *percent;
13012
13013 if (!plus && !(plus = get_latin1_char('+')))
13014 return NULL;
13015 if (!minus && !(minus = get_latin1_char('-')))
13016 return NULL;
13017 if (!blank && !(blank = get_latin1_char(' ')))
13018 return NULL;
13019 if (!zero && !(zero = get_latin1_char('0')))
13020 return NULL;
13021 if (!percent && !(percent = get_latin1_char('%')))
13022 return NULL;
13023
13024 if (format == NULL || args == NULL) {
13025 PyErr_BadInternalCall();
13026 return NULL;
13027 }
13028 uformat = PyUnicode_FromObject(format);
13029 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
13030 return NULL;
13031 if (_PyAccu_Init(&acc))
13032 goto onError;
13033 fmt = PyUnicode_DATA(uformat);
13034 fmtkind = PyUnicode_KIND(uformat);
13035 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13036 fmtpos = 0;
13037
13038 if (PyTuple_Check(args)) {
13039 arglen = PyTuple_Size(args);
13040 argidx = 0;
13041 }
13042 else {
13043 arglen = -1;
13044 argidx = -2;
13045 }
13046 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
13047 !PyUnicode_Check(args))
13048 dict = args;
13049
13050 while (--fmtcnt >= 0) {
13051 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13052 PyObject *nonfmt;
13053 Py_ssize_t nonfmtpos;
13054 nonfmtpos = fmtpos++;
13055 while (fmtcnt >= 0 &&
13056 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13057 fmtpos++;
13058 fmtcnt--;
13059 }
13060 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
13061 if (nonfmt == NULL)
13062 goto onError;
13063 r = _PyAccu_Accumulate(&acc, nonfmt);
13064 Py_DECREF(nonfmt);
13065 if (r)
13066 goto onError;
13067 }
13068 else {
13069 /* Got a format specifier */
13070 int flags = 0;
13071 Py_ssize_t width = -1;
13072 int prec = -1;
13073 Py_UCS4 c = '\0';
13074 Py_UCS4 fill, sign;
13075 int isnumok;
13076 PyObject *v = NULL;
13077 void *pbuf = NULL;
13078 Py_ssize_t pindex, len;
13079 PyObject *signobj = NULL, *fillobj = NULL;
13080
13081 fmtpos++;
13082 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13083 Py_ssize_t keystart;
13084 Py_ssize_t keylen;
13085 PyObject *key;
13086 int pcount = 1;
13087
13088 if (dict == NULL) {
13089 PyErr_SetString(PyExc_TypeError,
13090 "format requires a mapping");
13091 goto onError;
13092 }
13093 ++fmtpos;
13094 --fmtcnt;
13095 keystart = fmtpos;
13096 /* Skip over balanced parentheses */
13097 while (pcount > 0 && --fmtcnt >= 0) {
13098 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
13099 --pcount;
13100 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
13101 ++pcount;
13102 fmtpos++;
13103 }
13104 keylen = fmtpos - keystart - 1;
13105 if (fmtcnt < 0 || pcount > 0) {
13106 PyErr_SetString(PyExc_ValueError,
13107 "incomplete format key");
13108 goto onError;
13109 }
13110 key = PyUnicode_Substring(uformat,
13111 keystart, keystart + keylen);
13112 if (key == NULL)
13113 goto onError;
13114 if (args_owned) {
13115 Py_DECREF(args);
13116 args_owned = 0;
13117 }
13118 args = PyObject_GetItem(dict, key);
13119 Py_DECREF(key);
13120 if (args == NULL) {
13121 goto onError;
13122 }
13123 args_owned = 1;
13124 arglen = -1;
13125 argidx = -2;
13126 }
13127 while (--fmtcnt >= 0) {
13128 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
13129 case '-': flags |= F_LJUST; continue;
13130 case '+': flags |= F_SIGN; continue;
13131 case ' ': flags |= F_BLANK; continue;
13132 case '#': flags |= F_ALT; continue;
13133 case '0': flags |= F_ZERO; continue;
13134 }
13135 break;
13136 }
13137 if (c == '*') {
13138 v = getnextarg(args, arglen, &argidx);
13139 if (v == NULL)
13140 goto onError;
13141 if (!PyLong_Check(v)) {
13142 PyErr_SetString(PyExc_TypeError,
13143 "* wants int");
13144 goto onError;
13145 }
13146 width = PyLong_AsLong(v);
13147 if (width == -1 && PyErr_Occurred())
13148 goto onError;
13149 if (width < 0) {
13150 flags |= F_LJUST;
13151 width = -width;
13152 }
13153 if (--fmtcnt >= 0)
13154 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13155 }
13156 else if (c >= '0' && c <= '9') {
13157 width = c - '0';
13158 while (--fmtcnt >= 0) {
13159 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13160 if (c < '0' || c > '9')
13161 break;
13162 if ((width*10) / 10 != width) {
13163 PyErr_SetString(PyExc_ValueError,
13164 "width too big");
13165 goto onError;
13166 }
13167 width = width*10 + (c - '0');
13168 }
13169 }
13170 if (c == '.') {
13171 prec = 0;
13172 if (--fmtcnt >= 0)
13173 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13174 if (c == '*') {
13175 v = getnextarg(args, arglen, &argidx);
13176 if (v == NULL)
13177 goto onError;
13178 if (!PyLong_Check(v)) {
13179 PyErr_SetString(PyExc_TypeError,
13180 "* wants int");
13181 goto onError;
13182 }
13183 prec = PyLong_AsLong(v);
13184 if (prec == -1 && PyErr_Occurred())
13185 goto onError;
13186 if (prec < 0)
13187 prec = 0;
13188 if (--fmtcnt >= 0)
13189 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13190 }
13191 else if (c >= '0' && c <= '9') {
13192 prec = c - '0';
13193 while (--fmtcnt >= 0) {
13194 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13195 if (c < '0' || c > '9')
13196 break;
13197 if ((prec*10) / 10 != prec) {
13198 PyErr_SetString(PyExc_ValueError,
13199 "prec too big");
13200 goto onError;
13201 }
13202 prec = prec*10 + (c - '0');
13203 }
13204 }
13205 } /* prec */
13206 if (fmtcnt >= 0) {
13207 if (c == 'h' || c == 'l' || c == 'L') {
13208 if (--fmtcnt >= 0)
13209 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13210 }
13211 }
13212 if (fmtcnt < 0) {
13213 PyErr_SetString(PyExc_ValueError,
13214 "incomplete format");
13215 goto onError;
13216 }
13217 if (c != '%') {
13218 v = getnextarg(args, arglen, &argidx);
13219 if (v == NULL)
13220 goto onError;
13221 }
13222 sign = 0;
13223 fill = ' ';
13224 fillobj = blank;
13225 switch (c) {
13226
13227 case '%':
13228 _PyAccu_Accumulate(&acc, percent);
13229 continue;
13230
13231 case 's':
13232 case 'r':
13233 case 'a':
13234 if (PyUnicode_CheckExact(v) && c == 's') {
13235 temp = v;
13236 Py_INCREF(temp);
13237 }
13238 else {
13239 if (c == 's')
13240 temp = PyObject_Str(v);
13241 else if (c == 'r')
13242 temp = PyObject_Repr(v);
13243 else
13244 temp = PyObject_ASCII(v);
13245 if (temp == NULL)
13246 goto onError;
13247 if (PyUnicode_Check(temp))
13248 /* nothing to do */;
13249 else {
13250 Py_DECREF(temp);
13251 PyErr_SetString(PyExc_TypeError,
13252 "%s argument has non-string str()");
13253 goto onError;
13254 }
13255 }
13256 if (PyUnicode_READY(temp) == -1) {
13257 Py_CLEAR(temp);
13258 goto onError;
13259 }
13260 pbuf = PyUnicode_DATA(temp);
13261 kind = PyUnicode_KIND(temp);
13262 len = PyUnicode_GET_LENGTH(temp);
13263 if (prec >= 0 && len > prec)
13264 len = prec;
13265 break;
13266
13267 case 'i':
13268 case 'd':
13269 case 'u':
13270 case 'o':
13271 case 'x':
13272 case 'X':
13273 isnumok = 0;
13274 if (PyNumber_Check(v)) {
13275 PyObject *iobj=NULL;
13276
13277 if (PyLong_Check(v)) {
13278 iobj = v;
13279 Py_INCREF(iobj);
13280 }
13281 else {
13282 iobj = PyNumber_Long(v);
13283 }
13284 if (iobj!=NULL) {
13285 if (PyLong_Check(iobj)) {
13286 isnumok = 1;
13287 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
13288 Py_DECREF(iobj);
13289 if (!temp)
13290 goto onError;
13291 if (PyUnicode_READY(temp) == -1) {
13292 Py_CLEAR(temp);
13293 goto onError;
13294 }
13295 pbuf = PyUnicode_DATA(temp);
13296 kind = PyUnicode_KIND(temp);
13297 len = PyUnicode_GET_LENGTH(temp);
13298 sign = 1;
13299 }
13300 else {
13301 Py_DECREF(iobj);
13302 }
13303 }
13304 }
13305 if (!isnumok) {
13306 PyErr_Format(PyExc_TypeError,
13307 "%%%c format: a number is required, "
13308 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13309 goto onError;
13310 }
13311 if (flags & F_ZERO) {
13312 fill = '0';
13313 fillobj = zero;
13314 }
13315 break;
13316
13317 case 'e':
13318 case 'E':
13319 case 'f':
13320 case 'F':
13321 case 'g':
13322 case 'G':
13323 temp = formatfloat(v, flags, prec, c);
13324 if (!temp)
13325 goto onError;
13326 if (PyUnicode_READY(temp) == -1) {
13327 Py_CLEAR(temp);
13328 goto onError;
13329 }
13330 pbuf = PyUnicode_DATA(temp);
13331 kind = PyUnicode_KIND(temp);
13332 len = PyUnicode_GET_LENGTH(temp);
13333 sign = 1;
13334 if (flags & F_ZERO) {
13335 fill = '0';
13336 fillobj = zero;
13337 }
13338 break;
13339
13340 case 'c':
13341 {
13342 Py_UCS4 ch = formatchar(v);
13343 if (ch == (Py_UCS4) -1)
13344 goto onError;
13345 temp = _PyUnicode_FromUCS4(&ch, 1);
13346 if (temp == NULL)
13347 goto onError;
13348 pbuf = PyUnicode_DATA(temp);
13349 kind = PyUnicode_KIND(temp);
13350 len = PyUnicode_GET_LENGTH(temp);
13351 break;
13352 }
13353
13354 default:
13355 PyErr_Format(PyExc_ValueError,
13356 "unsupported format character '%c' (0x%x) "
13357 "at index %zd",
13358 (31<=c && c<=126) ? (char)c : '?',
13359 (int)c,
13360 fmtpos - 1);
13361 goto onError;
13362 }
13363 /* pbuf is initialized here. */
13364 pindex = 0;
13365 if (sign) {
13366 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13367 signobj = minus;
13368 len--;
13369 pindex++;
13370 }
13371 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13372 signobj = plus;
13373 len--;
13374 pindex++;
13375 }
13376 else if (flags & F_SIGN)
13377 signobj = plus;
13378 else if (flags & F_BLANK)
13379 signobj = blank;
13380 else
13381 sign = 0;
13382 }
13383 if (width < len)
13384 width = len;
13385 if (sign) {
13386 if (fill != ' ') {
13387 assert(signobj != NULL);
13388 if (_PyAccu_Accumulate(&acc, signobj))
13389 goto onError;
13390 }
13391 if (width > len)
13392 width--;
13393 }
13394 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13395 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13396 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
13397 if (fill != ' ') {
13398 second = get_latin1_char(
13399 PyUnicode_READ(kind, pbuf, pindex + 1));
13400 pindex += 2;
13401 if (second == NULL ||
13402 _PyAccu_Accumulate(&acc, zero) ||
13403 _PyAccu_Accumulate(&acc, second))
13404 goto onError;
13405 Py_CLEAR(second);
13406 }
13407 width -= 2;
13408 if (width < 0)
13409 width = 0;
13410 len -= 2;
13411 }
13412 if (width > len && !(flags & F_LJUST)) {
13413 assert(fillobj != NULL);
13414 if (repeat_accumulate(&acc, fillobj, width - len))
13415 goto onError;
13416 width = len;
13417 }
13418 if (fill == ' ') {
13419 if (sign) {
13420 assert(signobj != NULL);
13421 if (_PyAccu_Accumulate(&acc, signobj))
13422 goto onError;
13423 }
13424 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13425 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13426 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13427 second = get_latin1_char(
13428 PyUnicode_READ(kind, pbuf, pindex + 1));
13429 pindex += 2;
13430 if (second == NULL ||
13431 _PyAccu_Accumulate(&acc, zero) ||
13432 _PyAccu_Accumulate(&acc, second))
13433 goto onError;
13434 Py_CLEAR(second);
13435 }
13436 }
13437 /* Copy all characters, preserving len */
13438 if (temp != NULL) {
13439 assert(pbuf == PyUnicode_DATA(temp));
13440 v = PyUnicode_Substring(temp, pindex, pindex + len);
13441 }
13442 else {
13443 const char *p = (const char *) pbuf;
13444 assert(pbuf != NULL);
13445 p += kind * pindex;
13446 v = PyUnicode_FromKindAndData(kind, p, len);
13447 }
13448 if (v == NULL)
13449 goto onError;
13450 r = _PyAccu_Accumulate(&acc, v);
13451 Py_DECREF(v);
13452 if (r)
13453 goto onError;
13454 if (width > len && repeat_accumulate(&acc, blank, width - len))
13455 goto onError;
13456 if (dict && (argidx < arglen) && c != '%') {
13457 PyErr_SetString(PyExc_TypeError,
13458 "not all arguments converted during string formatting");
13459 goto onError;
13460 }
13461 Py_CLEAR(temp);
13462 } /* '%' */
13463 } /* until end */
13464 if (argidx < arglen && !dict) {
13465 PyErr_SetString(PyExc_TypeError,
13466 "not all arguments converted during string formatting");
13467 goto onError;
13468 }
13469
13470 result = _PyAccu_Finish(&acc);
13471 if (args_owned) {
13472 Py_DECREF(args);
13473 }
13474 Py_DECREF(uformat);
13475 Py_XDECREF(temp);
13476 Py_XDECREF(second);
13477 return result;
13478
13479 onError:
13480 Py_DECREF(uformat);
13481 Py_XDECREF(temp);
13482 Py_XDECREF(second);
13483 _PyAccu_Destroy(&acc);
13484 if (args_owned) {
13485 Py_DECREF(args);
13486 }
13487 return NULL;
13488 }
13489
13490 static PyObject *
13491 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13492
13493 static PyObject *
13494 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13495 {
13496 PyObject *x = NULL;
13497 static char *kwlist[] = {"object", "encoding", "errors", 0};
13498 char *encoding = NULL;
13499 char *errors = NULL;
13500
13501 if (type != &PyUnicode_Type)
13502 return unicode_subtype_new(type, args, kwds);
13503 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
13504 kwlist, &x, &encoding, &errors))
13505 return NULL;
13506 if (x == NULL)
13507 return PyUnicode_New(0, 0);
13508 if (encoding == NULL && errors == NULL)
13509 return PyObject_Str(x);
13510 else
13511 return PyUnicode_FromEncodedObject(x, encoding, errors);
13512 }
13513
13514 static PyObject *
13515 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13516 {
13517 PyObject *unicode, *self;
13518 Py_ssize_t length, char_size;
13519 int share_wstr, share_utf8;
13520 unsigned int kind;
13521 void *data;
13522
13523 assert(PyType_IsSubtype(type, &PyUnicode_Type));
13524
13525 unicode = unicode_new(&PyUnicode_Type, args, kwds);
13526 if (unicode == NULL)
13527 return NULL;
13528 assert(_PyUnicode_CHECK(unicode));
13529 if (PyUnicode_READY(unicode))
13530 return NULL;
13531
13532 self = type->tp_alloc(type, 0);
13533 if (self == NULL) {
13534 Py_DECREF(unicode);
13535 return NULL;
13536 }
13537 kind = PyUnicode_KIND(unicode);
13538 length = PyUnicode_GET_LENGTH(unicode);
13539
13540 _PyUnicode_LENGTH(self) = length;
13541 #ifdef Py_DEBUG
13542 _PyUnicode_HASH(self) = -1;
13543 #else
13544 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13545 #endif
13546 _PyUnicode_STATE(self).interned = 0;
13547 _PyUnicode_STATE(self).kind = kind;
13548 _PyUnicode_STATE(self).compact = 0;
13549 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13550 _PyUnicode_STATE(self).ready = 1;
13551 _PyUnicode_WSTR(self) = NULL;
13552 _PyUnicode_UTF8_LENGTH(self) = 0;
13553 _PyUnicode_UTF8(self) = NULL;
13554 _PyUnicode_WSTR_LENGTH(self) = 0;
13555 _PyUnicode_DATA_ANY(self) = NULL;
13556
13557 share_utf8 = 0;
13558 share_wstr = 0;
13559 if (kind == PyUnicode_1BYTE_KIND) {
13560 char_size = 1;
13561 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13562 share_utf8 = 1;
13563 }
13564 else if (kind == PyUnicode_2BYTE_KIND) {
13565 char_size = 2;
13566 if (sizeof(wchar_t) == 2)
13567 share_wstr = 1;
13568 }
13569 else {
13570 assert(kind == PyUnicode_4BYTE_KIND);
13571 char_size = 4;
13572 if (sizeof(wchar_t) == 4)
13573 share_wstr = 1;
13574 }
13575
13576 /* Ensure we won't overflow the length. */
13577 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13578 PyErr_NoMemory();
13579 goto onError;
13580 }
13581 data = PyObject_MALLOC((length + 1) * char_size);
13582 if (data == NULL) {
13583 PyErr_NoMemory();
13584 goto onError;
13585 }
13586
13587 _PyUnicode_DATA_ANY(self) = data;
13588 if (share_utf8) {
13589 _PyUnicode_UTF8_LENGTH(self) = length;
13590 _PyUnicode_UTF8(self) = data;
13591 }
13592 if (share_wstr) {
13593 _PyUnicode_WSTR_LENGTH(self) = length;
13594 _PyUnicode_WSTR(self) = (wchar_t *)data;
13595 }
13596
13597 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13598 kind * (length + 1));
13599 assert(_PyUnicode_CheckConsistency(self, 1));
13600 #ifdef Py_DEBUG
13601 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13602 #endif
13603 Py_DECREF(unicode);
13604 return self;
13605
13606 onError:
13607 Py_DECREF(unicode);
13608 Py_DECREF(self);
13609 return NULL;
13610 }
13611
/* Docstring installed as the tp_doc of the str type. */
PyDoc_STRVAR(unicode_doc,
"str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");

/* Forward declaration; referenced by the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  It wires the number, sequence and mapping
   protocol tables and the method table defined in this file into the
   type, and allows subclassing (Py_TPFLAGS_BASETYPE). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
13664
13665 /* Initialize the Unicode implementation */
13666
13667 int _PyUnicode_Init(void)
13668 {
13669 int i;
13670
13671 /* XXX - move this array to unicodectype.c ? */
13672 Py_UCS2 linebreak[] = {
13673 0x000A, /* LINE FEED */
13674 0x000D, /* CARRIAGE RETURN */
13675 0x001C, /* FILE SEPARATOR */
13676 0x001D, /* GROUP SEPARATOR */
13677 0x001E, /* RECORD SEPARATOR */
13678 0x0085, /* NEXT LINE */
13679 0x2028, /* LINE SEPARATOR */
13680 0x2029, /* PARAGRAPH SEPARATOR */
13681 };
13682
13683 /* Init the implementation */
13684 unicode_empty = PyUnicode_New(0, 0);
13685 if (!unicode_empty)
13686 Py_FatalError("Can't create empty string");
13687 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
13688
13689 for (i = 0; i < 256; i++)
13690 unicode_latin1[i] = NULL;
13691 if (PyType_Ready(&PyUnicode_Type) < 0)
13692 Py_FatalError("Can't initialize 'unicode'");
13693
13694 /* initialize the linebreak bloom filter */
13695 bloom_linebreak = make_bloom_mask(
13696 PyUnicode_2BYTE_KIND, linebreak,
13697 Py_ARRAY_LENGTH(linebreak));
13698
13699 PyType_Ready(&EncodingMapType);
13700
13701 #ifdef HAVE_MBCS
13702 winver.dwOSVersionInfoSize = sizeof(winver);
13703 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13704 PyErr_SetFromWindowsErr(0);
13705 return -1;
13706 }
13707 #endif
13708 return 0;
13709 }
13710
13711 /* Finalize the Unicode implementation */
13712
/* Performs no work in this implementation and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13718
13719 void
13720 _PyUnicode_Fini(void)
13721 {
13722 int i;
13723
13724 Py_XDECREF(unicode_empty);
13725 unicode_empty = NULL;
13726
13727 for (i = 0; i < 256; i++) {
13728 if (unicode_latin1[i]) {
13729 Py_DECREF(unicode_latin1[i]);
13730 unicode_latin1[i] = NULL;
13731 }
13732 }
13733 _PyUnicode_ClearStaticStrings();
13734 (void)PyUnicode_ClearFreeList();
13735 }
13736
13737 void
13738 PyUnicode_InternInPlace(PyObject **p)
13739 {
13740 register PyObject *s = *p;
13741 PyObject *t;
13742 #ifdef Py_DEBUG
13743 assert(s != NULL);
13744 assert(_PyUnicode_CHECK(s));
13745 #else
13746 if (s == NULL || !PyUnicode_Check(s))
13747 return;
13748 #endif
13749 /* If it's a subclass, we don't really know what putting
13750 it in the interned dict might do. */
13751 if (!PyUnicode_CheckExact(s))
13752 return;
13753 if (PyUnicode_CHECK_INTERNED(s))
13754 return;
13755 if (interned == NULL) {
13756 interned = PyDict_New();
13757 if (interned == NULL) {
13758 PyErr_Clear(); /* Don't leave an exception */
13759 return;
13760 }
13761 }
13762 /* It might be that the GetItem call fails even
13763 though the key is present in the dictionary,
13764 namely when this happens during a stack overflow. */
13765 Py_ALLOW_RECURSION
13766 t = PyDict_GetItem(interned, s);
13767 Py_END_ALLOW_RECURSION
13768
13769 if (t) {
13770 Py_INCREF(t);
13771 Py_DECREF(*p);
13772 *p = t;
13773 return;
13774 }
13775
13776 PyThreadState_GET()->recursion_critical = 1;
13777 if (PyDict_SetItem(interned, s, s) < 0) {
13778 PyErr_Clear();
13779 PyThreadState_GET()->recursion_critical = 0;
13780 return;
13781 }
13782 PyThreadState_GET()->recursion_critical = 0;
13783 /* The two references in interned are not counted by refcnt.
13784 The deallocator will take care of this */
13785 Py_REFCNT(s) -= 2;
13786 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
13787 }
13788
13789 void
13790 PyUnicode_InternImmortal(PyObject **p)
13791 {
13792 PyUnicode_InternInPlace(p);
13793 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
13794 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
13795 Py_INCREF(*p);
13796 }
13797 }
13798
13799 PyObject *
13800 PyUnicode_InternFromString(const char *cp)
13801 {
13802 PyObject *s = PyUnicode_FromString(cp);
13803 if (s == NULL)
13804 return NULL;
13805 PyUnicode_InternInPlace(&s);
13806 return s;
13807 }
13808
13809 void
13810 _Py_ReleaseInternedUnicodeStrings(void)
13811 {
13812 PyObject *keys;
13813 PyObject *s;
13814 Py_ssize_t i, n;
13815 Py_ssize_t immortal_size = 0, mortal_size = 0;
13816
13817 if (interned == NULL || !PyDict_Check(interned))
13818 return;
13819 keys = PyDict_Keys(interned);
13820 if (keys == NULL || !PyList_Check(keys)) {
13821 PyErr_Clear();
13822 return;
13823 }
13824
13825 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13826 detector, interned unicode strings are not forcibly deallocated;
13827 rather, we give them their stolen references back, and then clear
13828 and DECREF the interned dict. */
13829
13830 n = PyList_GET_SIZE(keys);
13831 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
13832 n);
13833 for (i = 0; i < n; i++) {
13834 s = PyList_GET_ITEM(keys, i);
13835 if (PyUnicode_READY(s) == -1) {
13836 assert(0 && "could not ready string");
13837 fprintf(stderr, "could not ready string\n");
13838 }
13839 switch (PyUnicode_CHECK_INTERNED(s)) {
13840 case SSTATE_NOT_INTERNED:
13841 /* XXX Shouldn't happen */
13842 break;
13843 case SSTATE_INTERNED_IMMORTAL:
13844 Py_REFCNT(s) += 1;
13845 immortal_size += PyUnicode_GET_LENGTH(s);
13846 break;
13847 case SSTATE_INTERNED_MORTAL:
13848 Py_REFCNT(s) += 2;
13849 mortal_size += PyUnicode_GET_LENGTH(s);
13850 break;
13851 default:
13852 Py_FatalError("Inconsistent interned string state.");
13853 }
13854 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
13855 }
13856 fprintf(stderr, "total size of all interned strings: "
13857 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13858 "mortal/immortal\n", mortal_size, immortal_size);
13859 Py_DECREF(keys);
13860 PyDict_Clear(interned);
13861 Py_DECREF(interned);
13862 interned = NULL;
13863 }
13864
13865
13866 /********************* Unicode Iterator **************************/
13867
13868 typedef struct {
13869 PyObject_HEAD
13870 Py_ssize_t it_index;
13871 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
13872 } unicodeiterobject;
13873
13874 static void
13875 unicodeiter_dealloc(unicodeiterobject *it)
13876 {
13877 _PyObject_GC_UNTRACK(it);
13878 Py_XDECREF(it->it_seq);
13879 PyObject_GC_Del(it);
13880 }
13881
13882 static int
13883 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13884 {
13885 Py_VISIT(it->it_seq);
13886 return 0;
13887 }
13888
13889 static PyObject *
13890 unicodeiter_next(unicodeiterobject *it)
13891 {
13892 PyObject *seq, *item;
13893
13894 assert(it != NULL);
13895 seq = it->it_seq;
13896 if (seq == NULL)
13897 return NULL;
13898 assert(_PyUnicode_CHECK(seq));
13899
13900 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13901 int kind = PyUnicode_KIND(seq);
13902 void *data = PyUnicode_DATA(seq);
13903 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13904 item = PyUnicode_FromOrdinal(chr);
13905 if (item != NULL)
13906 ++it->it_index;
13907 return item;
13908 }
13909
13910 Py_DECREF(seq);
13911 it->it_seq = NULL;
13912 return NULL;
13913 }
13914
13915 static PyObject *
13916 unicodeiter_len(unicodeiterobject *it)
13917 {
13918 Py_ssize_t len = 0;
13919 if (it->it_seq)
13920 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
13921 return PyLong_FromSsize_t(len);
13922 }
13923
13924 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13925
13926 static PyMethodDef unicodeiter_methods[] = {
13927 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
13928 length_hint_doc},
13929 {NULL, NULL} /* sentinel */
13930 };
13931
13932 PyTypeObject PyUnicodeIter_Type = {
13933 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13934 "str_iterator", /* tp_name */
13935 sizeof(unicodeiterobject), /* tp_basicsize */
13936 0, /* tp_itemsize */
13937 /* methods */
13938 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13939 0, /* tp_print */
13940 0, /* tp_getattr */
13941 0, /* tp_setattr */
13942 0, /* tp_reserved */
13943 0, /* tp_repr */
13944 0, /* tp_as_number */
13945 0, /* tp_as_sequence */
13946 0, /* tp_as_mapping */
13947 0, /* tp_hash */
13948 0, /* tp_call */
13949 0, /* tp_str */
13950 PyObject_GenericGetAttr, /* tp_getattro */
13951 0, /* tp_setattro */
13952 0, /* tp_as_buffer */
13953 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13954 0, /* tp_doc */
13955 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13956 0, /* tp_clear */
13957 0, /* tp_richcompare */
13958 0, /* tp_weaklistoffset */
13959 PyObject_SelfIter, /* tp_iter */
13960 (iternextfunc)unicodeiter_next, /* tp_iternext */
13961 unicodeiter_methods, /* tp_methods */
13962 0,
13963 };
13964
13965 static PyObject *
13966 unicode_iter(PyObject *seq)
13967 {
13968 unicodeiterobject *it;
13969
13970 if (!PyUnicode_Check(seq)) {
13971 PyErr_BadInternalCall();
13972 return NULL;
13973 }
13974 if (PyUnicode_READY(seq) == -1)
13975 return NULL;
13976 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13977 if (it == NULL)
13978 return NULL;
13979 it->it_index = 0;
13980 Py_INCREF(seq);
13981 it->it_seq = seq;
13982 _PyObject_GC_TRACK(it);
13983 return (PyObject *)it;
13984 }
13985
13986
13987 size_t
13988 Py_UNICODE_strlen(const Py_UNICODE *u)
13989 {
13990 int res = 0;
13991 while(*u++)
13992 res++;
13993 return res;
13994 }
13995
13996 Py_UNICODE*
13997 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
13998 {
13999 Py_UNICODE *u = s1;
14000 while ((*u++ = *s2++));
14001 return s1;
14002 }
14003
14004 Py_UNICODE*
14005 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14006 {
14007 Py_UNICODE *u = s1;
14008 while ((*u++ = *s2++))
14009 if (n-- == 0)
14010 break;
14011 return s1;
14012 }
14013
14014 Py_UNICODE*
14015 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14016 {
14017 Py_UNICODE *u1 = s1;
14018 u1 += Py_UNICODE_strlen(u1);
14019 Py_UNICODE_strcpy(u1, s2);
14020 return s1;
14021 }
14022
14023 int
14024 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14025 {
14026 while (*s1 && *s2 && *s1 == *s2)
14027 s1++, s2++;
14028 if (*s1 && *s2)
14029 return (*s1 < *s2) ? -1 : +1;
14030 if (*s1)
14031 return 1;
14032 if (*s2)
14033 return -1;
14034 return 0;
14035 }
14036
14037 int
14038 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14039 {
14040 register Py_UNICODE u1, u2;
14041 for (; n != 0; n--) {
14042 u1 = *s1;
14043 u2 = *s2;
14044 if (u1 != u2)
14045 return (u1 < u2) ? -1 : +1;
14046 if (u1 == '\0')
14047 return 0;
14048 s1++;
14049 s2++;
14050 }
14051 return 0;
14052 }
14053
14054 Py_UNICODE*
14055 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14056 {
14057 const Py_UNICODE *p;
14058 for (p = s; *p; p++)
14059 if (*p == c)
14060 return (Py_UNICODE*)p;
14061 return NULL;
14062 }
14063
14064 Py_UNICODE*
14065 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14066 {
14067 const Py_UNICODE *p;
14068 p = s + Py_UNICODE_strlen(s);
14069 while (p != s) {
14070 p--;
14071 if (*p == c)
14072 return (Py_UNICODE*)p;
14073 }
14074 return NULL;
14075 }
14076
14077 Py_UNICODE*
14078 PyUnicode_AsUnicodeCopy(PyObject *unicode)
14079 {
14080 Py_UNICODE *u, *copy;
14081 Py_ssize_t len, size;
14082
14083 if (!PyUnicode_Check(unicode)) {
14084 PyErr_BadArgument();
14085 return NULL;
14086 }
14087 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
14088 if (u == NULL)
14089 return NULL;
14090 /* Ensure we won't overflow the size. */
14091 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14092 PyErr_NoMemory();
14093 return NULL;
14094 }
14095 size = len + 1; /* copy the null character */
14096 size *= sizeof(Py_UNICODE);
14097 copy = PyMem_Malloc(size);
14098 if (copy == NULL) {
14099 PyErr_NoMemory();
14100 return NULL;
14101 }
14102 memcpy(copy, u, size);
14103 return copy;
14104 }
14105
14106 /* A _string module, to export formatter_parser and formatter_field_name_split
14107 to the string.Formatter class implemented in Python. */
14108
14109 static PyMethodDef _string_methods[] = {
14110 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14111 METH_O, PyDoc_STR("split the argument as a field name")},
14112 {"formatter_parser", (PyCFunction) formatter_parser,
14113 METH_O, PyDoc_STR("parse the argument as a format string")},
14114 {NULL, NULL}
14115 };
14116
14117 static struct PyModuleDef _string_module = {
14118 PyModuleDef_HEAD_INIT,
14119 "_string",
14120 PyDoc_STR("string helper module"),
14121 0,
14122 _string_methods,
14123 NULL,
14124 NULL,
14125 NULL,
14126 NULL
14127 };
14128
14129 PyMODINIT_FUNC
14130 PyInit__string(void)
14131 {
14132 return PyModule_Create(&_string_module);
14133 }
14134
14135