Ruby 3.4.4p34 (2025-05-14 revision a38531fd3f617bf734ef7d6c595325f69985ea1d)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/numeric.h"
35#include "internal/object.h"
36#include "internal/proc.h"
37#include "internal/re.h"
38#include "internal/sanitizers.h"
39#include "internal/string.h"
40#include "internal/transcode.h"
41#include "probes.h"
42#include "ruby/encoding.h"
43#include "ruby/re.h"
44#include "ruby/util.h"
45#include "ruby_assert.h"
46#include "vm_sync.h"
47
48#if defined HAVE_CRYPT_R
49# if defined HAVE_CRYPT_H
50# include <crypt.h>
51# endif
52#elif !defined HAVE_CRYPT
53# include "missing/crypt.h"
54# define HAVE_CRYPT_R 1
55#endif
56
/* Shorthand for entry `no` of the `beg` / `end` arrays of a variable named
 * `regs`, which must be in scope wherever these macros are expanded. */
57#define BEG(no) (regs->beg[(no)])
58#define END(no) (regs->end[(no)])
59
60#undef rb_str_new
61#undef rb_usascii_str_new
62#undef rb_utf8_str_new
63#undef rb_enc_str_new
64#undef rb_str_new_cstr
65#undef rb_usascii_str_new_cstr
66#undef rb_utf8_str_new_cstr
67#undef rb_enc_str_new_cstr
68#undef rb_external_str_new_cstr
69#undef rb_locale_str_new_cstr
70#undef rb_str_dup_frozen
71#undef rb_str_buf_new_cstr
72#undef rb_str_buf_cat
73#undef rb_str_buf_cat2
74#undef rb_str_cat2
75#undef rb_str_cat_cstr
76#undef rb_fstring_cstr
77
80
81/* Flags of RString
82 *
83 * 0: STR_SHARED (equal to ELTS_SHARED)
84 * The string is shared. The buffer this string points to is owned by
85 * another string (the shared root).
86 * 1: RSTRING_NOEMBED
87 * The string is not embedded. When a string is embedded, the contents
88 * follow the header. When a string is not embedded, the contents are
89 * in a separately allocated buffer.
90 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
91 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
92 * It emits a deprecation warning when mutated for the first time.
93 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
94 * The string was allocated by the `Symbol#to_s` method.
95 * It emits a deprecation warning when mutated for the first time.
96 * 4: STR_PRECOMPUTED_HASH
97 * The string is embedded and has its precomputed hashcode stored
98 * after the terminator.
99 * 5: STR_SHARED_ROOT
100 * Other strings may point to the contents of this string. When this
101 * flag is set, STR_SHARED must not be set.
102 * 6: STR_BORROWED
103 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
104 * to be unshared by rb_str_tmp_frozen_release.
105 * 7: STR_TMPLOCK
106 * The pointer to the buffer is passed to a system call such as
107 * read(2). Any modification and realloc is prohibited.
108 * 8-9: ENC_CODERANGE
109 * Stores the coderange of the string.
110 * 10-16: ENCODING
111 * Stores the encoding of the string.
112 * 17: RSTRING_FSTR
113 * The string is a fstring. The string is deduplicated in the fstring
114 * table.
115 * 18: STR_NOFREE
116 * Do not free this string's buffer when the string is reclaimed
117 * by the garbage collector. Used for when the string buffer is a C
118 * string literal.
119 * 19: STR_FAKESTR
120 * The string is not allocated or managed by the garbage collector.
121 * Typically, the string object header (struct RString) is temporarily
122 * allocated on C stack.
123 */
124
/* NOTE(review): presumably an upper bound on the byte length of one
 * character; no use is visible in this part of the file -- confirm. */
125#define RUBY_MAX_CHAR_LEN 16
/* RString flag bits local to this file. Bit numbers and meanings are listed
 * in the "Flags of RString" comment above. */
126#define STR_PRECOMPUTED_HASH FL_USER4
127#define STR_SHARED_ROOT FL_USER5
128#define STR_BORROWED FL_USER6
129#define STR_TMPLOCK FL_USER7
130#define STR_NOFREE FL_USER18
131#define STR_FAKESTR FL_USER19
132
/* STR_SET_NOEMBED: set STR_NOEMBED on `str` and clear STR_SHARED,
 * STR_SHARED_ROOT and STR_BORROWED. */
133#define STR_SET_NOEMBED(str) do {\
134 FL_SET((str), STR_NOEMBED);\
135 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
136} while (0)
/* STR_SET_EMBED: clear STR_NOEMBED, STR_SHARED and STR_NOFREE on `str`.
 * Only the flags change; no contents are moved. */
137#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
138
/* STR_SET_LEN: store `n` in the len field of `str`. Nothing else (such as
 * the terminator) is touched. */
139#define STR_SET_LEN(str, n) do { \
140 RSTRING(str)->len = (n); \
141} while (0)
142
143static inline bool
144str_encindex_fastpath(int encindex)
145{
146 // The overwhelming majority of strings are in one of these 3 encodings.
147 switch (encindex) {
148 case ENCINDEX_ASCII_8BIT:
149 case ENCINDEX_UTF_8:
150 case ENCINDEX_US_ASCII:
151 return true;
152 default:
153 return false;
154 }
155}
156
157static inline bool
158str_enc_fastpath(VALUE str)
159{
160 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
161}
162
/* TERM_LEN: terminator width in bytes for `str`: 1 for the fast-path
 * encodings, otherwise the minimum character length of its encoding. */
163#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* TERM_FILL: write a terminator of `termlen` zero bytes at `ptr`. The first
 * byte is always stored directly; memset runs only for terminators wider
 * than one byte. Each argument is evaluated exactly once. */
164#define TERM_FILL(ptr, termlen) do {\
165 char *const term_fill_ptr = (ptr);\
166 const int term_fill_len = (termlen);\
167 *term_fill_ptr = '\0';\
168 if (UNLIKELY(term_fill_len > 1))\
169 memset(term_fill_ptr, 0, term_fill_len);\
170} while (0)
171
/*
 * RESIZE_CAPA(str, capacity)
 *   RESIZE_CAPA_TERM() with the terminator width taken from TERM_LEN(str).
 *
 * RESIZE_CAPA_TERM(str, capacity, termlen)
 *   Make the buffer of `str` able to hold `capacity` bytes plus a
 *   terminator of `termlen` bytes.
 *   - Embedded string whose slot is too small: allocate a heap buffer,
 *     copy the current RSTRING_LEN(str) bytes, and switch to non-embedded.
 *   - Embedded string that already fits: nothing happens.
 *   - Heap string: reallocate the buffer. It must not be STR_SHARED.
 *
 * `capacity` and `termlen` are parenthesized at every use so that callers
 * may pass arbitrary expressions. The arguments are still evaluated more
 * than once, so they must be free of side effects.
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
195
/* STR_SET_SHARED: record `shared_str` as the shared root of `str`.
 * Does nothing when `str` is a fake string (STR_FAKESTR). Otherwise it
 * asserts that the buffer of `str` lies within the buffer of `shared_str`,
 * stores the reference through RB_OBJ_WRITE, sets STR_SHARED on `str` and
 * STR_SHARED_ROOT on `shared_str`, and additionally marks the root
 * STR_BORROWED when the root has no class (RBASIC_CLASS == 0). */
196#define STR_SET_SHARED(str, shared_str) do { \
197 if (!FL_TEST(str, STR_FAKESTR)) { \
198 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
199 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
200 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
201 FL_SET((str), STR_SHARED); \
202 FL_SET((shared_str), STR_SHARED_ROOT); \
203 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
204 FL_SET_RAW((shared_str), STR_BORROWED); \
205 } \
206} while (0)
207
/* STR_HEAP_PTR: the heap buffer pointer of a non-embedded string.
 * STR_HEAP_SIZE: the heap capacity field plus the terminator width, i.e.
 * capa itself does not include the terminator. */
208#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
209#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
210/* TODO: include the terminator size in capa. */
211
/* STR_ENC_GET: the rb_encoding of `str`, via get_encoding(). */
212#define STR_ENC_GET(str) get_encoding(str)
213
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 unless defined by the build.
 * With 0, SHARABLE_SUBSTRING_P accepts only ranges that end exactly at
 * `end` (beg + len == end); with a non-zero value it accepts every range. */
214#if !defined SHARABLE_MIDDLE_SUBSTRING
215# define SHARABLE_MIDDLE_SUBSTRING 0
216#endif
217#if !SHARABLE_MIDDLE_SUBSTRING
218#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
219#else
220#define SHARABLE_SUBSTRING_P(beg, len, end) 1
221#endif
222
223
224static inline long
225str_embed_capa(VALUE str)
226{
227 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
228}
229
230bool
231rb_str_reembeddable_p(VALUE str)
232{
233 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
234}
235
236static inline size_t
237rb_str_embed_size(long capa)
238{
239 return offsetof(struct RString, as.embed.ary) + capa;
240}
241
242size_t
243rb_str_size_as_embedded(VALUE str)
244{
245 size_t real_size;
246 if (STR_EMBED_P(str)) {
247 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
248 }
249 /* if the string is not currently embedded, but it can be embedded, how
250 * much space would it require */
251 else if (rb_str_reembeddable_p(str)) {
252 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
253 }
254 else {
255 real_size = sizeof(struct RString);
256 }
257
258 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
259 real_size += sizeof(st_index_t);
260 }
261
262 return real_size;
263}
264
/* True when an embedded string holding `len` content bytes plus a
 * `termlen`-byte terminator has a size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const long payload = len + termlen;

    return rb_gc_size_allocatable_p(rb_str_embed_size(payload));
}
270
271static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
272static VALUE str_new_frozen(VALUE klass, VALUE orig);
273static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
274static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
275static VALUE str_new(VALUE klass, const char *ptr, long len);
276static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
277static inline void str_modifiable(VALUE str);
278static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
279static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
280
281static inline void
282str_make_independent(VALUE str)
283{
284 long len = RSTRING_LEN(str);
285 int termlen = TERM_LEN(str);
286 str_make_independent_expand((str), len, 0L, termlen);
287}
288
289static inline int str_dependent_p(VALUE str);
290
291void
292rb_str_make_independent(VALUE str)
293{
294 if (str_dependent_p(str)) {
295 str_make_independent(str);
296 }
297}
298
299void
300rb_str_make_embedded(VALUE str)
301{
302 RUBY_ASSERT(rb_str_reembeddable_p(str));
303 RUBY_ASSERT(!STR_EMBED_P(str));
304
305 char *buf = RSTRING(str)->as.heap.ptr;
306 long len = RSTRING(str)->len;
307
308 STR_SET_EMBED(str);
309 STR_SET_LEN(str, len);
310
311 if (len > 0) {
312 memcpy(RSTRING_PTR(str), buf, len);
313 ruby_xfree(buf);
314 }
315
316 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
317}
318
/* Prints a diagnostic to stderr saying that the function named `func` is
 * about to return NULL. Only reports; it does not abort. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
328
329/* symbols for [up|down|swap]case/capitalize options */
330static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
331
332static rb_encoding *
333get_encoding(VALUE str)
334{
335 return rb_enc_from_index(ENCODING_GET(str));
336}
337
338static void
339mustnot_broken(VALUE str)
340{
341 if (is_broken_string(str)) {
342 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
343 }
344}
345
346static void
347mustnot_wchar(VALUE str)
348{
349 rb_encoding *enc = STR_ENC_GET(str);
350 if (rb_enc_mbminlen(enc) > 1) {
351 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
352 }
353}
354
355static int fstring_cmp(VALUE a, VALUE b);
356
357static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
358
359#if SIZEOF_LONG == SIZEOF_VOIDP
360#define PRECOMPUTED_FAKESTR_HASH 1
361#else
362#endif
363
364#ifdef PRECOMPUTED_FAKESTR_HASH
365static st_index_t
366fstring_hash(VALUE str)
367{
368 if (FL_TEST_RAW(str, STR_FAKESTR)) {
369 // register_fstring precomputes the hash and stores it in capa for fake strings
370 return (st_index_t)RSTRING(str)->as.heap.aux.capa;
371 }
372 else {
373 return rb_str_hash(str);
374 }
375}
376#else
377#define fstring_hash rb_str_hash
378#endif
379
380const struct st_hash_type rb_fstring_hash_type = {
381 fstring_cmp,
382 fstring_hash,
383};
384
/* True when `str` does not have FL_EXIVAR set and its class is exactly
 * rb_cString. */
385#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
386
387static inline st_index_t
388str_do_hash(VALUE str)
389{
390 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
391 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
392 if (e && !is_ascii_string(str)) {
393 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
394 }
395 return h;
396}
397
398static VALUE
399str_store_precomputed_hash(VALUE str, st_index_t hash)
400{
401 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
402 RUBY_ASSERT(STR_EMBED_P(str));
403
404#if RUBY_DEBUG
405 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
406 size_t free_bytes = str_embed_capa(str) - used_bytes;
407 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
408#endif
409
410 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
411
412 FL_SET(str, STR_PRECOMPUTED_HASH);
413
414 return str;
415}
416
418 VALUE fstr;
419 bool copy;
420 bool force_precompute_hash;
421};
422
/*
 * st_update() callback for the fstring table.
 *
 * key/value: entry being looked up or inserted.
 * data:      pointer to a struct fstr_update_arg.
 * existing:  non-zero when the key is already in the table.
 *
 * Existing entry: if the stored string is reported as garbage, the entry
 * is deleted and arg->fstr is set to Qundef; otherwise the stored string
 * is returned through arg->fstr and the table is left untouched.
 * New entry: a frozen string suitable for the table is produced, flagged
 * RSTRING_FSTR, stored as both key and value, and returned in arg->fstr.
 */
423static int
424fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
425{
426 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
427 VALUE str = (VALUE)*key;
428
429 if (existing) {
430 /* because of lazy sweep, str may be unmarked already and swept
431 * at next time */
432
433 if (rb_objspace_garbage_object_p(str)) {
434 arg->fstr = Qundef;
435 // When RSTRING_FSTR strings are swept, they call `st_delete`.
436 // To avoid a race condition if an equivalent string was inserted
437 // we must remove the flag immediately.
438 FL_UNSET_RAW(str, RSTRING_FSTR);
439 return ST_DELETE;
440 }
441
442 arg->fstr = str;
443 return ST_STOP;
444 }
445 else {
446 // Unless the string is empty or binary, its coderange has been precomputed.
447 int coderange = ENC_CODERANGE(str);
448
/* Fake strings must be replaced by a real object before they can be
 * stored in the table. */
449 if (FL_TEST_RAW(str, STR_FAKESTR)) {
450 if (arg->copy) {
451 VALUE new_str;
452 long len = RSTRING_LEN(str);
453 long capa = len + sizeof(st_index_t);
454 int term_len = TERM_LEN(str);
455
/* Preferred form: an embedded copy with room for the hash after the
 * terminator. */
456 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
457 new_str = str_alloc_embed(rb_cString, capa + term_len);
458 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
459 STR_SET_LEN(new_str, RSTRING_LEN(str));
460 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
461 rb_enc_copy(new_str, str);
462 str_store_precomputed_hash(new_str, fstring_hash(str));
463 }
464 else {
465 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
466 rb_enc_copy(new_str, str);
467#ifdef PRECOMPUTED_FAKESTR_HASH
/* Store the hash only when the copy happens to have enough spare
 * capacity for it. */
468 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
469 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
470 }
471#endif
472 }
473 str = new_str;
474 }
475 else {
/* No copy requested: reference the fake string's bytes directly. */
476 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
477 RSTRING(str)->len,
478 ENCODING_GET(str));
479 }
480 OBJ_FREEZE(str);
481 }
482 else {
483 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
484 str = str_new_frozen(rb_cString, str);
485 }
486 if (STR_SHARED_P(str)) { /* str should not be shared */
487 /* shared substring */
488 str_make_independent(str);
490 }
491 if (!BARE_STRING_P(str)) {
492 str = str_new_frozen(rb_cString, str);
493 }
494 }
495
/* Carry the coderange read from the original key over to the string that
 * is actually stored. */
496 ENC_CODERANGE_SET(str, coderange);
497 RBASIC(str)->flags |= RSTRING_FSTR;
498
499 *key = *value = arg->fstr = str;
500 return ST_CONTINUE;
501 }
502}
503
504VALUE
505rb_fstring(VALUE str)
506{
507 VALUE fstr;
508 int bare;
509
510 Check_Type(str, T_STRING);
511
512 if (FL_TEST(str, RSTRING_FSTR))
513 return str;
514
515 bare = BARE_STRING_P(str);
516 if (!bare) {
517 if (STR_EMBED_P(str)) {
518 OBJ_FREEZE(str);
519 return str;
520 }
521
522 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
524 return str;
525 }
526 }
527
528 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
529 rb_str_resize(str, RSTRING_LEN(str));
530
531 fstr = register_fstring(str, false, false);
532
533 if (!bare) {
534 str_replace_shared_without_enc(str, fstr);
535 OBJ_FREEZE(str);
536 return str;
537 }
538 return fstr;
539}
540
541static VALUE
542register_fstring(VALUE str, bool copy, bool force_precompute_hash)
543{
544 struct fstr_update_arg args = {
545 .copy = copy,
546 .force_precompute_hash = force_precompute_hash
547 };
548
549#if SIZEOF_VOIDP == SIZEOF_LONG
550 if (FL_TEST_RAW(str, STR_FAKESTR)) {
551 // if the string hasn't been interned, we'll need the hash twice, so we
552 // compute it once and store it in capa
553 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
554 }
555#endif
556
557 RB_VM_LOCK_ENTER();
558 {
559 st_table *frozen_strings = rb_vm_fstring_table();
560 do {
561 args.fstr = str;
562 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
563 } while (UNDEF_P(args.fstr));
564 }
565 RB_VM_LOCK_LEAVE();
566
567 RUBY_ASSERT(OBJ_FROZEN(args.fstr));
568 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
569 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
570 RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);
571
572 return args.fstr;
573}
574
575static VALUE
576setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
577{
578 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
579
580 if (!name) {
582 name = "";
583 }
584
585 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
586
587 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
588 fake_str->len = len;
589 fake_str->as.heap.ptr = (char *)name;
590 fake_str->as.heap.aux.capa = len;
591 return (VALUE)fake_str;
592}
593
594/*
595 * set up a fake string which refers a static string literal.
596 */
597VALUE
598rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
599{
600 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
601}
602
603/*
604 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
605 * shared string which refers a static string literal. `ptr` must
606 * point a constant string.
607 */
608VALUE
609rb_fstring_new(const char *ptr, long len)
610{
611 struct RString fake_str;
612 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
613}
614
615VALUE
616rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
617{
618 struct RString fake_str;
619 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
620}
621
622VALUE
623rb_fstring_cstr(const char *ptr)
624{
625 return rb_fstring_new(ptr, strlen(ptr));
626}
627
628static int
629fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
630{
631 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
632 return ST_CONTINUE;
633}
634
635static int
636fstring_cmp(VALUE a, VALUE b)
637{
638 long alen, blen;
639 const char *aptr, *bptr;
640 RSTRING_GETMEM(a, aptr, alen);
641 RSTRING_GETMEM(b, bptr, blen);
642 return (alen != blen ||
643 ENCODING_GET(a) != ENCODING_GET(b) ||
644 memcmp(aptr, bptr, alen) != 0);
645}
646
647static inline bool
648single_byte_optimizable(VALUE str)
649{
650 int encindex = ENCODING_GET(str);
651 switch (encindex) {
652 case ENCINDEX_ASCII_8BIT:
653 case ENCINDEX_US_ASCII:
654 return true;
655 case ENCINDEX_UTF_8:
656 // For UTF-8 it's worth scanning the string coderange when unknown.
658 }
659 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
660 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
661 return true;
662 }
663
664 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
665 return true;
666 }
667
668 /* Conservative. Possibly single byte.
669 * "\xa1" in Shift_JIS for example. */
670 return false;
671}
672
674
/*
 * Returns a pointer to the first byte in [p, e) that has bit 0x80 set, or
 * NULL when there is none.
 *
 * The range is examined in up to three phases: single bytes up to the next
 * word boundary (only on targets without unaligned word access), whole
 * machine words tested against NONASCII_MASK, and the remaining tail bytes.
 */
675static inline const char *
676search_nonascii(const char *p, const char *e)
677{
678 const uintptr_t *s, *t;
679
/* NONASCII_MASK: a word-sized constant with 0x80 in every byte. */
680#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
681# if SIZEOF_UINTPTR_T == 8
682# define NONASCII_MASK UINT64_C(0x8080808080808080)
683# elif SIZEOF_UINTPTR_T == 4
684# define NONASCII_MASK UINT32_C(0x80808080)
685# else
686# error "don't know what to do."
687# endif
688#else
689# if SIZEOF_UINTPTR_T == 8
690# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
691# elif SIZEOF_UINTPTR_T == 4
692# define NONASCII_MASK 0x80808080UL /* or...? */
693# else
694# error "don't know what to do."
695# endif
696#endif
697
/* Word-wise scanning is attempted when unaligned access is allowed or the
 * range is at least one pointer-sized word long. */
698 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
699#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, then test the skipped bytes one by
 * one through negative offsets. The cases fall through on purpose. */
700 if ((uintptr_t)p % SIZEOF_VOIDP) {
701 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
702 p += l;
703 switch (l) {
704 default: UNREACHABLE;
705#if SIZEOF_VOIDP > 4
706 case 7: if (p[-7]&0x80) return p-7;
707 case 6: if (p[-6]&0x80) return p-6;
708 case 5: if (p[-5]&0x80) return p-5;
709 case 4: if (p[-4]&0x80) return p-4;
710#endif
711 case 3: if (p[-3]&0x80) return p-3;
712 case 2: if (p[-2]&0x80) return p-2;
713 case 1: if (p[-1]&0x80) return p-1;
714 case 0: break;
715 }
716 }
717#endif
718#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
719#define aligned_ptr(value) \
720 __builtin_assume_aligned((value), sizeof(uintptr_t))
721#else
722#define aligned_ptr(value) (uintptr_t *)(value)
723#endif
724 s = aligned_ptr(p);
/* t bounds the loop so that every word read ends at or before e. */
725 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
726#undef aligned_ptr
/* On a hit, the count of zero bits before the first set mask bit (leading
 * on big-endian, trailing otherwise) divided by 8 is the byte offset
 * within the word. */
727 for (;s < t; s++) {
728 if (*s & NONASCII_MASK) {
729#ifdef WORDS_BIGENDIAN
730 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
731#else
732 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
733#endif
734 }
735 }
736 p = (const char *)s;
737 }
738
/* Fewer than one word remains: test the tail bytes individually, counted
 * back from e. The cases fall through on purpose. */
739 switch (e - p) {
740 default: UNREACHABLE;
741#if SIZEOF_VOIDP > 4
742 case 7: if (e[-7]&0x80) return e-7;
743 case 6: if (e[-6]&0x80) return e-6;
744 case 5: if (e[-5]&0x80) return e-5;
745 case 4: if (e[-4]&0x80) return e-4;
746#endif
747 case 3: if (e[-3]&0x80) return e-3;
748 case 2: if (e[-2]&0x80) return e-2;
749 case 1: if (e[-1]&0x80) return e-1;
750 case 0: return NULL;
751 }
752}
753
754static int
755coderange_scan(const char *p, long len, rb_encoding *enc)
756{
757 const char *e = p + len;
758
759 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
760 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
761 p = search_nonascii(p, e);
763 }
764
765 if (rb_enc_asciicompat(enc)) {
766 p = search_nonascii(p, e);
767 if (!p) return ENC_CODERANGE_7BIT;
768 for (;;) {
769 int ret = rb_enc_precise_mbclen(p, e, enc);
771 p += MBCLEN_CHARFOUND_LEN(ret);
772 if (p == e) break;
773 p = search_nonascii(p, e);
774 if (!p) break;
775 }
776 }
777 else {
778 while (p < e) {
779 int ret = rb_enc_precise_mbclen(p, e, enc);
781 p += MBCLEN_CHARFOUND_LEN(ret);
782 }
783 }
784 return ENC_CODERANGE_VALID;
785}
786
787long
788rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
789{
790 const char *p = s;
791
792 if (*cr == ENC_CODERANGE_BROKEN)
793 return e - s;
794
795 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
796 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
797 if (*cr == ENC_CODERANGE_VALID) return e - s;
798 p = search_nonascii(p, e);
800 return e - s;
801 }
802 else if (rb_enc_asciicompat(enc)) {
803 p = search_nonascii(p, e);
804 if (!p) {
805 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
806 return e - s;
807 }
808 for (;;) {
809 int ret = rb_enc_precise_mbclen(p, e, enc);
810 if (!MBCLEN_CHARFOUND_P(ret)) {
812 return p - s;
813 }
814 p += MBCLEN_CHARFOUND_LEN(ret);
815 if (p == e) break;
816 p = search_nonascii(p, e);
817 if (!p) break;
818 }
819 }
820 else {
821 while (p < e) {
822 int ret = rb_enc_precise_mbclen(p, e, enc);
823 if (!MBCLEN_CHARFOUND_P(ret)) {
825 return p - s;
826 }
827 p += MBCLEN_CHARFOUND_LEN(ret);
828 }
829 }
831 return e - s;
832}
833
834static inline void
835str_enc_copy(VALUE str1, VALUE str2)
836{
837 rb_enc_set_index(str1, ENCODING_GET(str2));
838}
839
840/* Like str_enc_copy, but does not check frozen status of str1.
841 * You should use this only if you're certain that str1 is not frozen. */
842static inline void
843str_enc_copy_direct(VALUE str1, VALUE str2)
844{
845 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
846 if (inlined_encoding == ENCODING_INLINE_MAX) {
847 rb_enc_set_index(str1, rb_enc_get_index(str2));
848 }
849 else {
850 ENCODING_SET_INLINED(str1, inlined_encoding);
851 }
852}
853
854static void
855rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
856{
857 /* this function is designed for copying encoding and coderange
858 * from src to new string "dest" which is made from the part of src.
859 */
860 str_enc_copy(dest, src);
861 if (RSTRING_LEN(dest) == 0) {
862 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
864 else
866 return;
867 }
868 switch (ENC_CODERANGE(src)) {
871 break;
873 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
874 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
876 else
878 break;
879 default:
880 break;
881 }
882}
883
884static void
885rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
886{
887 str_enc_copy(dest, src);
889}
890
891static int
892enc_coderange_scan(VALUE str, rb_encoding *enc)
893{
894 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
895}
896
897int
898rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
899{
900 return enc_coderange_scan(str, enc);
901}
902
903int
905{
906 int cr = ENC_CODERANGE(str);
907
908 if (cr == ENC_CODERANGE_UNKNOWN) {
909 cr = enc_coderange_scan(str, get_encoding(str));
910 ENC_CODERANGE_SET(str, cr);
911 }
912 return cr;
913}
914
915static inline bool
916rb_enc_str_asciicompat(VALUE str)
917{
918 int encindex = ENCODING_GET_INLINED(str);
919 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
920}
921
922int
924{
925 switch(ENC_CODERANGE(str)) {
927 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
929 return true;
930 default:
931 return false;
932 }
933}
934
935static inline void
936str_mod_check(VALUE s, const char *p, long len)
937{
938 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
939 rb_raise(rb_eRuntimeError, "string modified");
940 }
941}
942
943static size_t
944str_capacity(VALUE str, const int termlen)
945{
946 if (STR_EMBED_P(str)) {
947 return str_embed_capa(str) - termlen;
948 }
949 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
950 return RSTRING(str)->len;
951 }
952 else {
953 return RSTRING(str)->as.heap.aux.capa;
954 }
955}
956
957size_t
959{
960 return str_capacity(str, TERM_LEN(str));
961}
962
963static inline void
964must_not_null(const char *ptr)
965{
966 if (!ptr) {
967 rb_raise(rb_eArgError, "NULL pointer given");
968 }
969}
970
971static inline VALUE
972str_alloc_embed(VALUE klass, size_t capa)
973{
974 size_t size = rb_str_embed_size(capa);
975 RUBY_ASSERT(size > 0);
976 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
977
978 NEWOBJ_OF(str, struct RString, klass,
980
981 return (VALUE)str;
982}
983
984static inline VALUE
985str_alloc_heap(VALUE klass)
986{
987 NEWOBJ_OF(str, struct RString, klass,
988 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
989
990 return (VALUE)str;
991}
992
993static inline VALUE
994empty_str_alloc(VALUE klass)
995{
996 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
997 VALUE str = str_alloc_embed(klass, 0);
998 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1000 return str;
1001}
1002
1003static VALUE
1004str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1005{
1006 VALUE str;
1007
1008 if (len < 0) {
1009 rb_raise(rb_eArgError, "negative string size (or size too big)");
1010 }
1011
1012 if (enc == NULL) {
1013 enc = rb_ascii8bit_encoding();
1014 }
1015
1016 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1017
1018 int termlen = rb_enc_mbminlen(enc);
1019
1020 if (STR_EMBEDDABLE_P(len, termlen)) {
1021 str = str_alloc_embed(klass, len + termlen);
1022 if (len == 0) {
1023 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1024 }
1025 }
1026 else {
1027 str = str_alloc_heap(klass);
1028 RSTRING(str)->as.heap.aux.capa = len;
1029 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1030 * integer overflow. If we can STATIC_ASSERT that, the following
1031 * mul_add_mul can be reverted to a simple ALLOC_N. */
1032 RSTRING(str)->as.heap.ptr =
1033 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1034 }
1035
1036 rb_enc_raw_set(str, enc);
1037
1038 if (ptr) {
1039 memcpy(RSTRING_PTR(str), ptr, len);
1040 }
1041
1042 STR_SET_LEN(str, len);
1043 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1044 return str;
1045}
1046
1047static VALUE
1048str_new(VALUE klass, const char *ptr, long len)
1049{
1050 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1051}
1052
1053VALUE
1054rb_str_new(const char *ptr, long len)
1055{
1056 return str_new(rb_cString, ptr, len);
1057}
1058
1059VALUE
1060rb_usascii_str_new(const char *ptr, long len)
1061{
1062 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1063}
1064
1065VALUE
1066rb_utf8_str_new(const char *ptr, long len)
1067{
1068 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1069}
1070
1071VALUE
1072rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1073{
1074 return str_enc_new(rb_cString, ptr, len, enc);
1075}
1076
1077VALUE
1078rb_str_new_cstr(const char *ptr)
1079{
1080 must_not_null(ptr);
1081 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1082 * memory regions, and that cannot be detected by the MSAN. Just
1083 * trust the programmer that the argument passed here is a sane C
1084 * string. */
1085 __msan_unpoison_string(ptr);
1086 return rb_str_new(ptr, strlen(ptr));
1087}
1088
1089VALUE
1091{
1092 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1093}
1094
1095VALUE
1096rb_utf8_str_new_cstr(const char *ptr)
1097{
1098 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1099}
1100
1101VALUE
1102rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
1103{
1104 must_not_null(ptr);
1105 if (rb_enc_mbminlen(enc) != 1) {
1106 rb_raise(rb_eArgError, "wchar encoding given");
1107 }
1108 return rb_enc_str_new(ptr, strlen(ptr), enc);
1109}
1110
1111static VALUE
1112str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1113{
1114 VALUE str;
1115
1116 if (len < 0) {
1117 rb_raise(rb_eArgError, "negative string size (or size too big)");
1118 }
1119
1120 if (!ptr) {
1121 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1122 }
1123 else {
1124 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1125 str = str_alloc_heap(klass);
1126 RSTRING(str)->len = len;
1127 RSTRING(str)->as.heap.ptr = (char *)ptr;
1128 RSTRING(str)->as.heap.aux.capa = len;
1129 RBASIC(str)->flags |= STR_NOFREE;
1130 rb_enc_associate_index(str, encindex);
1131 }
1132 return str;
1133}
1134
1135VALUE
1136rb_str_new_static(const char *ptr, long len)
1137{
1138 return str_new_static(rb_cString, ptr, len, 0);
1139}
1140
1141VALUE
1142rb_usascii_str_new_static(const char *ptr, long len)
1143{
1144 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1145}
1146
1147VALUE
1148rb_utf8_str_new_static(const char *ptr, long len)
1149{
1150 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1151}
1152
1153VALUE
1154rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1155{
1156 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1157}
1158
1159static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1160 rb_encoding *from, rb_encoding *to,
1161 int ecflags, VALUE ecopts);
1162
1163static inline bool
1164is_enc_ascii_string(VALUE str, rb_encoding *enc)
1165{
1166 int encidx = rb_enc_to_index(enc);
1167 if (rb_enc_get_index(str) == encidx)
1168 return is_ascii_string(str);
1169 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1170}
1171
1172VALUE
1173rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1174{
1175 long len;
1176 const char *ptr;
1177 VALUE newstr;
1178
1179 if (!to) return str;
1180 if (!from) from = rb_enc_get(str);
1181 if (from == to) return str;
1182 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1183 rb_is_ascii8bit_enc(to)) {
1184 if (STR_ENC_GET(str) != to) {
1185 str = rb_str_dup(str);
1186 rb_enc_associate(str, to);
1187 }
1188 return str;
1189 }
1190
1191 RSTRING_GETMEM(str, ptr, len);
1192 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1193 from, to, ecflags, ecopts);
1194 if (NIL_P(newstr)) {
1195 /* some error, return original */
1196 return str;
1197 }
1198 return newstr;
1199}
1200
1201VALUE
1202rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1203 rb_encoding *from, int ecflags, VALUE ecopts)
1204{
1205 long olen;
1206
1207 olen = RSTRING_LEN(newstr);
1208 if (ofs < -olen || olen < ofs)
1209 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1210 if (ofs < 0) ofs += olen;
1211 if (!from) {
1212 STR_SET_LEN(newstr, ofs);
1213 return rb_str_cat(newstr, ptr, len);
1214 }
1215
1216 rb_str_modify(newstr);
1217 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1218 rb_enc_get(newstr),
1219 ecflags, ecopts);
1220}
1221
1222VALUE
1223rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1224{
1225 STR_SET_LEN(str, 0);
1226 rb_enc_associate(str, enc);
1227 rb_str_cat(str, ptr, len);
1228 return str;
1229}
1230
1231static VALUE
1232str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1233 rb_encoding *from, rb_encoding *to,
1234 int ecflags, VALUE ecopts)
1235{
1236 rb_econv_t *ec;
1238 long olen;
1239 VALUE econv_wrapper;
1240 const unsigned char *start, *sp;
1241 unsigned char *dest, *dp;
1242 size_t converted_output = (size_t)ofs;
1243
1244 olen = rb_str_capacity(newstr);
1245
1246 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1247 RBASIC_CLEAR_CLASS(econv_wrapper);
1248 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1249 if (!ec) return Qnil;
1250 DATA_PTR(econv_wrapper) = ec;
1251
1252 sp = (unsigned char*)ptr;
1253 start = sp;
1254 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1255 (dp = dest + converted_output),
1256 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1258 /* destination buffer short */
1259 size_t converted_input = sp - start;
1260 size_t rest = len - converted_input;
1261 converted_output = dp - dest;
1262 rb_str_set_len(newstr, converted_output);
1263 if (converted_input && converted_output &&
1264 rest < (LONG_MAX / converted_output)) {
1265 rest = (rest * converted_output) / converted_input;
1266 }
1267 else {
1268 rest = olen;
1269 }
1270 olen += rest < 2 ? 2 : rest;
1271 rb_str_resize(newstr, olen);
1272 }
1273 DATA_PTR(econv_wrapper) = 0;
1274 RB_GC_GUARD(econv_wrapper);
1275 rb_econv_close(ec);
1276 switch (ret) {
1277 case econv_finished:
1278 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1279 rb_str_set_len(newstr, len);
1280 rb_enc_associate(newstr, to);
1281 return newstr;
1282
1283 default:
1284 return Qnil;
1285 }
1286}
1287
1288VALUE
1289rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1290{
1291 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1292}
1293
1294VALUE
1295rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1296{
1297 rb_encoding *ienc;
1298 VALUE str;
1299 const int eidx = rb_enc_to_index(eenc);
1300
1301 if (!ptr) {
1302 return rb_enc_str_new(ptr, len, eenc);
1303 }
1304
1305 /* ASCII-8BIT case, no conversion */
1306 if ((eidx == rb_ascii8bit_encindex()) ||
1307 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1308 return rb_str_new(ptr, len);
1309 }
1310 /* no default_internal or same encoding, no conversion */
1311 ienc = rb_default_internal_encoding();
1312 if (!ienc || eenc == ienc) {
1313 return rb_enc_str_new(ptr, len, eenc);
1314 }
1315 /* ASCII compatible, and ASCII only string, no conversion in
1316 * default_internal */
1317 if ((eidx == rb_ascii8bit_encindex()) ||
1318 (eidx == rb_usascii_encindex()) ||
1319 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1320 return rb_enc_str_new(ptr, len, ienc);
1321 }
1322 /* convert from the given encoding to default_internal */
1323 str = rb_enc_str_new(NULL, 0, ienc);
1324 /* when the conversion failed for some reason, just ignore the
1325 * default_internal and result in the given encoding as-is. */
1326 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1327 rb_str_initialize(str, ptr, len, eenc);
1328 }
1329 return str;
1330}
1331
1332VALUE
1333rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1334{
1335 int eidx = rb_enc_to_index(eenc);
1336 if (eidx == rb_usascii_encindex() &&
1337 !is_ascii_string(str)) {
1338 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1339 return str;
1340 }
1341 rb_enc_associate_index(str, eidx);
1342 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1343}
1344
1345VALUE
1346rb_external_str_new(const char *ptr, long len)
1347{
1348 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1349}
1350
1351VALUE
1353{
1354 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1355}
1356
1357VALUE
1358rb_locale_str_new(const char *ptr, long len)
1359{
1360 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1361}
1362
1363VALUE
1365{
1366 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1367}
1368
1369VALUE
1370rb_filesystem_str_new(const char *ptr, long len)
1371{
1372 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1373}
1374
1375VALUE
1377{
1378 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1379}
1380
1381VALUE
1383{
1384 return rb_str_export_to_enc(str, rb_default_external_encoding());
1385}
1386
1387VALUE
1389{
1390 return rb_str_export_to_enc(str, rb_locale_encoding());
1391}
1392
1393VALUE
1394rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1395{
1396 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1397}
1398
1399static VALUE
1400str_replace_shared_without_enc(VALUE str2, VALUE str)
1401{
1402 const int termlen = TERM_LEN(str);
1403 char *ptr;
1404 long len;
1405
1406 RSTRING_GETMEM(str, ptr, len);
1407 if (str_embed_capa(str2) >= len + termlen) {
1408 char *ptr2 = RSTRING(str2)->as.embed.ary;
1409 STR_SET_EMBED(str2);
1410 memcpy(ptr2, RSTRING_PTR(str), len);
1411 TERM_FILL(ptr2+len, termlen);
1412 }
1413 else {
1414 VALUE root;
1415 if (STR_SHARED_P(str)) {
1416 root = RSTRING(str)->as.heap.aux.shared;
1417 RSTRING_GETMEM(str, ptr, len);
1418 }
1419 else {
1420 root = rb_str_new_frozen(str);
1421 RSTRING_GETMEM(root, ptr, len);
1422 }
1423 RUBY_ASSERT(OBJ_FROZEN(root));
1424
1425 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1426 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1427 rb_fatal("about to free a possible shared root");
1428 }
1429 char *ptr2 = STR_HEAP_PTR(str2);
1430 if (ptr2 != ptr) {
1431 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1432 }
1433 }
1434 FL_SET(str2, STR_NOEMBED);
1435 RSTRING(str2)->as.heap.ptr = ptr;
1436 STR_SET_SHARED(str2, root);
1437 }
1438
1439 STR_SET_LEN(str2, len);
1440
1441 return str2;
1442}
1443
1444static VALUE
1445str_replace_shared(VALUE str2, VALUE str)
1446{
1447 str_replace_shared_without_enc(str2, str);
1448 rb_enc_cr_str_exact_copy(str2, str);
1449 return str2;
1450}
1451
1452static VALUE
1453str_new_shared(VALUE klass, VALUE str)
1454{
1455 return str_replace_shared(str_alloc_heap(klass), str);
1456}
1457
1458VALUE
1460{
1461 return str_new_shared(rb_obj_class(str), str);
1462}
1463
1464VALUE
1466{
1467 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1468 return str_new_frozen(rb_obj_class(orig), orig);
1469}
1470
1471static VALUE
1472rb_str_new_frozen_String(VALUE orig)
1473{
1474 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1475 return str_new_frozen(rb_cString, orig);
1476}
1477
1478VALUE
1479rb_str_tmp_frozen_acquire(VALUE orig)
1480{
1481 if (OBJ_FROZEN_RAW(orig)) return orig;
1482 return str_new_frozen_buffer(0, orig, FALSE);
1483}
1484
1485VALUE
1486rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1487{
1488 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1489 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1490
1491 VALUE str = str_alloc_heap(0);
1492 OBJ_FREEZE(str);
1493 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1494 FL_SET(str, STR_SHARED_ROOT);
1495
1496 size_t capa = str_capacity(orig, TERM_LEN(orig));
1497
1498 /* If the string is embedded then we want to create a copy that is heap
1499 * allocated. If the string is shared then the shared root must be
1500 * embedded, so we want to create a copy. If the string is a shared root
1501 * then it must be embedded, so we want to create a copy. */
1502 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1503 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1504 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1505 }
1506 else {
1507 /* orig must be heap allocated and not shared, so we can safely transfer
1508 * the pointer to str. */
1509 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1510 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1511 RBASIC(orig)->flags &= ~STR_NOFREE;
1512 STR_SET_SHARED(orig, str);
1513 }
1514
1515 RSTRING(str)->len = RSTRING(orig)->len;
1516 RSTRING(str)->as.heap.aux.capa = capa;
1517
1518 return str;
1519}
1520
1521void
1522rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1523{
1524 if (RBASIC_CLASS(tmp) != 0)
1525 return;
1526
1527 if (STR_EMBED_P(tmp)) {
1529 }
1530 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1531 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1532 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1533
1534 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1535 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1536 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1537
1538 /* Unshare orig since the root (tmp) only has this one child. */
1539 FL_UNSET_RAW(orig, STR_SHARED);
1540 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1541 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1543
1544 /* Make tmp embedded and empty so it is safe for sweeping. */
1545 STR_SET_EMBED(tmp);
1546 STR_SET_LEN(tmp, 0);
1547 }
1548 }
1549}
1550
1551static VALUE
1552str_new_frozen(VALUE klass, VALUE orig)
1553{
1554 return str_new_frozen_buffer(klass, orig, TRUE);
1555}
1556
1557static VALUE
1558heap_str_make_shared(VALUE klass, VALUE orig)
1559{
1560 RUBY_ASSERT(!STR_EMBED_P(orig));
1561 RUBY_ASSERT(!STR_SHARED_P(orig));
1562
1563 VALUE str = str_alloc_heap(klass);
1564 STR_SET_LEN(str, RSTRING_LEN(orig));
1565 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1566 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1567 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1568 RBASIC(orig)->flags &= ~STR_NOFREE;
1569 STR_SET_SHARED(orig, str);
1570 if (klass == 0)
1571 FL_UNSET_RAW(str, STR_BORROWED);
1572 return str;
1573}
1574
1575static VALUE
1576str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1577{
1578 VALUE str;
1579
1580 long len = RSTRING_LEN(orig);
1581 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1582 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1583
1584 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1585 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1586 RUBY_ASSERT(STR_EMBED_P(str));
1587 }
1588 else {
1589 if (FL_TEST_RAW(orig, STR_SHARED)) {
1590 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1591 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1592 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1593 RUBY_ASSERT(ofs >= 0);
1594 RUBY_ASSERT(rest >= 0);
1595 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1597
1598 if ((ofs > 0) || (rest > 0) ||
1599 (klass != RBASIC(shared)->klass) ||
1600 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1601 str = str_new_shared(klass, shared);
1602 RUBY_ASSERT(!STR_EMBED_P(str));
1603 RSTRING(str)->as.heap.ptr += ofs;
1604 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1605 }
1606 else {
1607 if (RBASIC_CLASS(shared) == 0)
1608 FL_SET_RAW(shared, STR_BORROWED);
1609 return shared;
1610 }
1611 }
1612 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1613 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1614 STR_SET_EMBED(str);
1615 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1616 STR_SET_LEN(str, RSTRING_LEN(orig));
1617 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1618 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1619 }
1620 else {
1621 str = heap_str_make_shared(klass, orig);
1622 }
1623 }
1624
1625 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1626 OBJ_FREEZE(str);
1627 return str;
1628}
1629
1630VALUE
1631rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1632{
1633 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1634}
1635
1636static VALUE
1637str_new_empty_String(VALUE str)
1638{
1639 VALUE v = rb_str_new(0, 0);
1640 rb_enc_copy(v, str);
1641 return v;
1642}
1643
1644#define STR_BUF_MIN_SIZE 63
1645
1646VALUE
1648{
1649 if (STR_EMBEDDABLE_P(capa, 1)) {
1650 return str_alloc_embed(rb_cString, capa + 1);
1651 }
1652
1653 VALUE str = str_alloc_heap(rb_cString);
1654
1655 RSTRING(str)->as.heap.aux.capa = capa;
1656 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1657 RSTRING(str)->as.heap.ptr[0] = '\0';
1658
1659 return str;
1660}
1661
1662VALUE
1663rb_str_buf_new_cstr(const char *ptr)
1664{
1665 VALUE str;
1666 long len = strlen(ptr);
1667
1668 str = rb_str_buf_new(len);
1669 rb_str_buf_cat(str, ptr, len);
1670
1671 return str;
1672}
1673
1674VALUE
1676{
1677 return str_new(0, 0, len);
1678}
1679
1680void
1682{
1683 if (STR_EMBED_P(str)) {
1684 RB_DEBUG_COUNTER_INC(obj_str_embed);
1685 }
1686 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1687 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1688 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1689 }
1690 else {
1691 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1692 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1693 }
1694}
1695
1696size_t
1697rb_str_memsize(VALUE str)
1698{
1699 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1700 return STR_HEAP_SIZE(str);
1701 }
1702 else {
1703 return 0;
1704 }
1705}
1706
1707VALUE
1709{
1710 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1711}
1712
1713static inline void str_discard(VALUE str);
1714static void str_shared_replace(VALUE str, VALUE str2);
1715
1716void
1718{
1719 if (str != str2) str_shared_replace(str, str2);
1720}
1721
1722static void
1723str_shared_replace(VALUE str, VALUE str2)
1724{
1725 rb_encoding *enc;
1726 int cr;
1727 int termlen;
1728
1729 RUBY_ASSERT(str2 != str);
1730 enc = STR_ENC_GET(str2);
1731 cr = ENC_CODERANGE(str2);
1732 str_discard(str);
1733 termlen = rb_enc_mbminlen(enc);
1734
1735 STR_SET_LEN(str, RSTRING_LEN(str2));
1736
1737 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1738 STR_SET_EMBED(str);
1739 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1740 rb_enc_associate(str, enc);
1741 ENC_CODERANGE_SET(str, cr);
1742 }
1743 else {
1744 if (STR_EMBED_P(str2)) {
1745 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1746 long len = RSTRING_LEN(str2);
1747 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1748
1749 char *new_ptr = ALLOC_N(char, len + termlen);
1750 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1751 RSTRING(str2)->as.heap.ptr = new_ptr;
1752 STR_SET_LEN(str2, len);
1753 RSTRING(str2)->as.heap.aux.capa = len;
1754 STR_SET_NOEMBED(str2);
1755 }
1756
1757 STR_SET_NOEMBED(str);
1758 FL_UNSET(str, STR_SHARED);
1759 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1760
1761 if (FL_TEST(str2, STR_SHARED)) {
1762 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1763 STR_SET_SHARED(str, shared);
1764 }
1765 else {
1766 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1767 }
1768
1769 /* abandon str2 */
1770 STR_SET_EMBED(str2);
1771 RSTRING_PTR(str2)[0] = 0;
1772 STR_SET_LEN(str2, 0);
1773 rb_enc_associate(str, enc);
1774 ENC_CODERANGE_SET(str, cr);
1775 }
1776}
1777
1778VALUE
1779rb_obj_as_string(VALUE obj)
1780{
1781 VALUE str;
1782
1783 if (RB_TYPE_P(obj, T_STRING)) {
1784 return obj;
1785 }
1786 str = rb_funcall(obj, idTo_s, 0);
1787 return rb_obj_as_string_result(str, obj);
1788}
1789
1790VALUE
1791rb_obj_as_string_result(VALUE str, VALUE obj)
1792{
1793 if (!RB_TYPE_P(str, T_STRING))
1794 return rb_any_to_s(obj);
1795 return str;
1796}
1797
1798static VALUE
1799str_replace(VALUE str, VALUE str2)
1800{
1801 long len;
1802
1803 len = RSTRING_LEN(str2);
1804 if (STR_SHARED_P(str2)) {
1805 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1807 STR_SET_NOEMBED(str);
1808 STR_SET_LEN(str, len);
1809 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1810 STR_SET_SHARED(str, shared);
1811 rb_enc_cr_str_exact_copy(str, str2);
1812 }
1813 else {
1814 str_replace_shared(str, str2);
1815 }
1816
1817 return str;
1818}
1819
1820static inline VALUE
1821ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1822{
1823 size_t size = rb_str_embed_size(capa);
1824 RUBY_ASSERT(size > 0);
1825 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1826
1827 NEWOBJ_OF(str, struct RString, klass,
1829
1830 return (VALUE)str;
1831}
1832
1833static inline VALUE
1834ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1835{
1836 NEWOBJ_OF(str, struct RString, klass,
1837 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1838
1839 return (VALUE)str;
1840}
1841
1842static inline VALUE
1843str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1844{
1845 int encidx = 0;
1846 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1847 encidx = rb_enc_get_index(str);
1848 flags &= ~ENCODING_MASK;
1849 }
1850 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1851 if (encidx) rb_enc_associate_index(dup, encidx);
1852 return dup;
1853}
1854
1855static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1856
1857static inline VALUE
1858str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1859{
1860 VALUE flags = FL_TEST_RAW(str, flag_mask);
1861 long len = RSTRING_LEN(str);
1862
1863 RUBY_ASSERT(STR_EMBED_P(dup));
1864 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1865 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1866 STR_SET_LEN(dup, RSTRING_LEN(str));
1867 return str_duplicate_setup_encoding(str, dup, flags);
1868}
1869
1870static inline VALUE
1871str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1872{
1873 VALUE flags = FL_TEST_RAW(str, flag_mask);
1874 VALUE root = str;
1875 if (FL_TEST_RAW(str, STR_SHARED)) {
1876 root = RSTRING(str)->as.heap.aux.shared;
1877 }
1878 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1879 root = str = str_new_frozen(klass, str);
1880 flags = FL_TEST_RAW(str, flag_mask);
1881 }
1882 RUBY_ASSERT(!STR_SHARED_P(root));
1884
1885 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1886 FL_SET(root, STR_SHARED_ROOT);
1887 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1888 flags |= RSTRING_NOEMBED | STR_SHARED;
1889
1890 STR_SET_LEN(dup, RSTRING_LEN(str));
1891 return str_duplicate_setup_encoding(str, dup, flags);
1892}
1893
1894static inline VALUE
1895str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1896{
1897 if (STR_EMBED_P(str)) {
1898 return str_duplicate_setup_embed(klass, str, dup);
1899 }
1900 else {
1901 return str_duplicate_setup_heap(klass, str, dup);
1902 }
1903}
1904
1905static inline VALUE
1906str_duplicate(VALUE klass, VALUE str)
1907{
1908 VALUE dup;
1909 if (STR_EMBED_P(str)) {
1910 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1911 }
1912 else {
1913 dup = str_alloc_heap(klass);
1914 }
1915
1916 return str_duplicate_setup(klass, str, dup);
1917}
1918
1919VALUE
1921{
1922 return str_duplicate(rb_obj_class(str), str);
1923}
1924
1925/* :nodoc: */
1926VALUE
1927rb_str_dup_m(VALUE str)
1928{
1929 if (LIKELY(BARE_STRING_P(str))) {
1930 return str_duplicate(rb_obj_class(str), str);
1931 }
1932 else {
1933 return rb_obj_dup(str);
1934 }
1935}
1936
1937VALUE
1939{
1940 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1941 return str_duplicate(rb_cString, str);
1942}
1943
1944VALUE
1945rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1946{
1947 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1948 VALUE new_str, klass = rb_cString;
1949
1950 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1951 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1952 str_duplicate_setup_embed(klass, str, new_str);
1953 }
1954 else {
1955 new_str = ec_str_alloc_heap(ec, klass);
1956 str_duplicate_setup_heap(klass, str, new_str);
1957 }
1958 if (chilled) {
1959 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1960 }
1961 return new_str;
1962}
1963
1964VALUE
1965rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
1966{
1967 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
1968 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
1969 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1970 FL_SET_RAW(str, STR_CHILLED_LITERAL);
1971 return rb_str_freeze(str);
1972}
1973
1974/*
1975 *
1976 * call-seq:
1977 * String.new(string = '', **opts) -> new_string
1978 *
1979 * :include: doc/string/new.rdoc
1980 *
1981 */
1982
1983static VALUE
1984rb_str_init(int argc, VALUE *argv, VALUE str)
1985{
1986 static ID keyword_ids[2];
1987 VALUE orig, opt, venc, vcapa;
1988 VALUE kwargs[2];
1989 rb_encoding *enc = 0;
1990 int n;
1991
1992 if (!keyword_ids[0]) {
1993 keyword_ids[0] = rb_id_encoding();
1994 CONST_ID(keyword_ids[1], "capacity");
1995 }
1996
1997 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1998 if (!NIL_P(opt)) {
1999 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2000 venc = kwargs[0];
2001 vcapa = kwargs[1];
2002 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2003 enc = rb_to_encoding(venc);
2004 }
2005 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2006 long capa = NUM2LONG(vcapa);
2007 long len = 0;
2008 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2009
2010 if (capa < STR_BUF_MIN_SIZE) {
2011 capa = STR_BUF_MIN_SIZE;
2012 }
2013 if (n == 1) {
2014 StringValue(orig);
2015 len = RSTRING_LEN(orig);
2016 if (capa < len) {
2017 capa = len;
2018 }
2019 if (orig == str) n = 0;
2020 }
2021 str_modifiable(str);
2022 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2023 /* make noembed always */
2024 const size_t size = (size_t)capa + termlen;
2025 const char *const old_ptr = RSTRING_PTR(str);
2026 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2027 char *new_ptr = ALLOC_N(char, size);
2028 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2029 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2030 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2031 RSTRING(str)->as.heap.ptr = new_ptr;
2032 }
2033 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2034 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2035 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2036 }
2037 STR_SET_LEN(str, len);
2038 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2039 if (n == 1) {
2040 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2041 rb_enc_cr_str_exact_copy(str, orig);
2042 }
2043 FL_SET(str, STR_NOEMBED);
2044 RSTRING(str)->as.heap.aux.capa = capa;
2045 }
2046 else if (n == 1) {
2047 rb_str_replace(str, orig);
2048 }
2049 if (enc) {
2050 rb_enc_associate(str, enc);
2052 }
2053 }
2054 else if (n == 1) {
2055 rb_str_replace(str, orig);
2056 }
2057 return str;
2058}
2059
2060/* :nodoc: */
2061static VALUE
2062rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2063{
2064 if (klass != rb_cString) {
2065 return rb_class_new_instance_pass_kw(argc, argv, klass);
2066 }
2067
2068 static ID keyword_ids[2];
2069 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2070 VALUE kwargs[2];
2071 rb_encoding *enc = NULL;
2072
2073 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2074 if (NIL_P(opt)) {
2075 return rb_class_new_instance_pass_kw(argc, argv, klass);
2076 }
2077
2078 keyword_ids[0] = rb_id_encoding();
2079 CONST_ID(keyword_ids[1], "capacity");
2080 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2081 encoding = kwargs[0];
2082 capacity = kwargs[1];
2083
2084 if (n == 1) {
2085 orig = StringValue(orig);
2086 }
2087 else {
2088 orig = Qnil;
2089 }
2090
2091 if (UNDEF_P(encoding)) {
2092 if (!NIL_P(orig)) {
2093 encoding = rb_obj_encoding(orig);
2094 }
2095 }
2096
2097 if (!UNDEF_P(encoding)) {
2098 enc = rb_to_encoding(encoding);
2099 }
2100
2101 // If capacity is nil, we're basically just duping `orig`.
2102 if (UNDEF_P(capacity)) {
2103 if (NIL_P(orig)) {
2104 VALUE empty_str = str_new(klass, "", 0);
2105 if (enc) {
2106 rb_enc_associate(empty_str, enc);
2107 }
2108 return empty_str;
2109 }
2110 VALUE copy = str_duplicate(klass, orig);
2111 rb_enc_associate(copy, enc);
2112 ENC_CODERANGE_CLEAR(copy);
2113 return copy;
2114 }
2115
2116 long capa = 0;
2117 capa = NUM2LONG(capacity);
2118 if (capa < 0) {
2119 capa = 0;
2120 }
2121
2122 if (!NIL_P(orig)) {
2123 long orig_capa = rb_str_capacity(orig);
2124 if (orig_capa > capa) {
2125 capa = orig_capa;
2126 }
2127 }
2128
2129 VALUE str = str_enc_new(klass, NULL, capa, enc);
2130 STR_SET_LEN(str, 0);
2131 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2132
2133 if (!NIL_P(orig)) {
2134 rb_str_buf_append(str, orig);
2135 }
2136
2137 return str;
2138}
2139
2140#ifdef NONASCII_MASK
2141#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2142
2143/*
2144 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2145 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2146 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2147 *
2148 * if (!(byte & 0x80))
2149 * byte |= 0x40; // turn on bit6
2150 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2151 *
2152 * This function calculates whether a byte is leading or not for all bytes
2153 * in the argument word by concurrently using the above logic, and then
2154 * adds up the number of leading bytes in the word.
2155 */
2156static inline uintptr_t
2157count_utf8_lead_bytes_with_word(const uintptr_t *s)
2158{
2159 uintptr_t d = *s;
2160
2161 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2162 d = (d>>6) | (~d>>7);
2163 d &= NONASCII_MASK >> 7;
2164
2165 /* Gather all bytes. */
2166#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2167 /* use only if it can use POPCNT */
2168 return rb_popcount_intptr(d);
2169#else
2170 d += (d>>8);
2171 d += (d>>16);
2172# if SIZEOF_VOIDP == 8
2173 d += (d>>32);
2174# endif
2175 return (d&0xF);
2176#endif
2177}
2178#endif
2179
2180static inline long
2181enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2182{
2183 long c;
2184 const char *q;
2185
2186 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2187 long diff = (long)(e - p);
2188 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2189 }
2190#ifdef NONASCII_MASK
2191 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2192 uintptr_t len = 0;
2193 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2194 const uintptr_t *s, *t;
2195 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2196 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2197 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2198 while (p < (const char *)s) {
2199 if (is_utf8_lead_byte(*p)) len++;
2200 p++;
2201 }
2202 while (s < t) {
2203 len += count_utf8_lead_bytes_with_word(s);
2204 s++;
2205 }
2206 p = (const char *)s;
2207 }
2208 while (p < e) {
2209 if (is_utf8_lead_byte(*p)) len++;
2210 p++;
2211 }
2212 return (long)len;
2213 }
2214#endif
2215 else if (rb_enc_asciicompat(enc)) {
2216 c = 0;
2217 if (ENC_CODERANGE_CLEAN_P(cr)) {
2218 while (p < e) {
2219 if (ISASCII(*p)) {
2220 q = search_nonascii(p, e);
2221 if (!q)
2222 return c + (e - p);
2223 c += q - p;
2224 p = q;
2225 }
2226 p += rb_enc_fast_mbclen(p, e, enc);
2227 c++;
2228 }
2229 }
2230 else {
2231 while (p < e) {
2232 if (ISASCII(*p)) {
2233 q = search_nonascii(p, e);
2234 if (!q)
2235 return c + (e - p);
2236 c += q - p;
2237 p = q;
2238 }
2239 p += rb_enc_mbclen(p, e, enc);
2240 c++;
2241 }
2242 }
2243 return c;
2244 }
2245
2246 for (c=0; p<e; c++) {
2247 p += rb_enc_mbclen(p, e, enc);
2248 }
2249 return c;
2250}
2251
2252long
2253rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2254{
2255 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2256}
2257
2258/* To get strlen with cr
2259 * Note that given cr is not used.
2260 */
2261long
2262rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2263{
2264 long c;
2265 const char *q;
2266 int ret;
2267
2268 *cr = 0;
2269 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2270 long diff = (long)(e - p);
2271 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2272 }
2273 else if (rb_enc_asciicompat(enc)) {
2274 c = 0;
2275 while (p < e) {
2276 if (ISASCII(*p)) {
2277 q = search_nonascii(p, e);
2278 if (!q) {
2279 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2280 return c + (e - p);
2281 }
2282 c += q - p;
2283 p = q;
2284 }
2285 ret = rb_enc_precise_mbclen(p, e, enc);
2286 if (MBCLEN_CHARFOUND_P(ret)) {
2287 *cr |= ENC_CODERANGE_VALID;
2288 p += MBCLEN_CHARFOUND_LEN(ret);
2289 }
2290 else {
2292 p++;
2293 }
2294 c++;
2295 }
2296 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2297 return c;
2298 }
2299
2300 for (c=0; p<e; c++) {
2301 ret = rb_enc_precise_mbclen(p, e, enc);
2302 if (MBCLEN_CHARFOUND_P(ret)) {
2303 *cr |= ENC_CODERANGE_VALID;
2304 p += MBCLEN_CHARFOUND_LEN(ret);
2305 }
2306 else {
2308 if (p + rb_enc_mbminlen(enc) <= e)
2309 p += rb_enc_mbminlen(enc);
2310 else
2311 p = e;
2312 }
2313 }
2314 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2315 return c;
2316}
2317
2318/* enc must be str's enc or rb_enc_check(str, str2) */
2319static long
2320str_strlen(VALUE str, rb_encoding *enc)
2321{
2322 const char *p, *e;
2323 int cr;
2324
2325 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2326 if (!enc) enc = STR_ENC_GET(str);
2327 p = RSTRING_PTR(str);
2328 e = RSTRING_END(str);
2329 cr = ENC_CODERANGE(str);
2330
2331 if (cr == ENC_CODERANGE_UNKNOWN) {
2332 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2333 if (cr) ENC_CODERANGE_SET(str, cr);
2334 return n;
2335 }
2336 else {
2337 return enc_strlen(p, e, enc, cr);
2338 }
2339}
2340
2341long
2343{
2344 return str_strlen(str, NULL);
2345}
2346
2347/*
2348 * call-seq:
2349 * length -> integer
2350 *
2351 * :include: doc/string/length.rdoc
2352 *
2353 */
2354
2355VALUE
2357{
2358 return LONG2NUM(str_strlen(str, NULL));
2359}
2360
2361/*
2362 * call-seq:
2363 * bytesize -> integer
2364 *
2365 * :include: doc/string/bytesize.rdoc
2366 *
2367 */
2368
2369VALUE
2370rb_str_bytesize(VALUE str)
2371{
2372 return LONG2NUM(RSTRING_LEN(str));
2373}
2374
2375/*
2376 * call-seq:
2377 * empty? -> true or false
2378 *
2379 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2380 *
2381 * "hello".empty? # => false
2382 * " ".empty? # => false
2383 * "".empty? # => true
2384 *
2385 */
2386
2387static VALUE
2388rb_str_empty(VALUE str)
2389{
2390 return RBOOL(RSTRING_LEN(str) == 0);
2391}
2392
2393/*
2394 * call-seq:
2395 * string + other_string -> new_string
2396 *
2397 * Returns a new +String+ containing +other_string+ concatenated to +self+:
2398 *
2399 * "Hello from " + self.to_s # => "Hello from main"
2400 *
2401 */
2402
2403VALUE
2405{
2406 VALUE str3;
2407 rb_encoding *enc;
2408 char *ptr1, *ptr2, *ptr3;
2409 long len1, len2;
2410 int termlen;
2411
2412 StringValue(str2);
2413 enc = rb_enc_check_str(str1, str2);
2414 RSTRING_GETMEM(str1, ptr1, len1);
2415 RSTRING_GETMEM(str2, ptr2, len2);
2416 termlen = rb_enc_mbminlen(enc);
2417 if (len1 > LONG_MAX - len2) {
2418 rb_raise(rb_eArgError, "string size too big");
2419 }
2420 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2421 ptr3 = RSTRING_PTR(str3);
2422 memcpy(ptr3, ptr1, len1);
2423 memcpy(ptr3+len1, ptr2, len2);
2424 TERM_FILL(&ptr3[len1+len2], termlen);
2425
2426 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2428 RB_GC_GUARD(str1);
2429 RB_GC_GUARD(str2);
2430 return str3;
2431}
2432
2433/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2434VALUE
2435rb_str_opt_plus(VALUE str1, VALUE str2)
2436{
2439 long len1, len2;
2440 MAYBE_UNUSED(char) *ptr1, *ptr2;
2441 RSTRING_GETMEM(str1, ptr1, len1);
2442 RSTRING_GETMEM(str2, ptr2, len2);
2443 int enc1 = rb_enc_get_index(str1);
2444 int enc2 = rb_enc_get_index(str2);
2445
2446 if (enc1 < 0) {
2447 return Qundef;
2448 }
2449 else if (enc2 < 0) {
2450 return Qundef;
2451 }
2452 else if (enc1 != enc2) {
2453 return Qundef;
2454 }
2455 else if (len1 > LONG_MAX - len2) {
2456 return Qundef;
2457 }
2458 else {
2459 return rb_str_plus(str1, str2);
2460 }
2461
2462}
2463
2464/*
2465 * call-seq:
2466 * string * integer -> new_string
2467 *
2468 * Returns a new +String+ containing +integer+ copies of +self+:
2469 *
2470 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2471 * "Ho! " * 0 # => ""
2472 *
2473 */
2474
2475VALUE
2477{
2478 VALUE str2;
2479 long n, len;
2480 char *ptr2;
2481 int termlen;
2482
2483 if (times == INT2FIX(1)) {
2484 return str_duplicate(rb_cString, str);
2485 }
2486 if (times == INT2FIX(0)) {
2487 str2 = str_alloc_embed(rb_cString, 0);
2488 rb_enc_copy(str2, str);
2489 return str2;
2490 }
2491 len = NUM2LONG(times);
2492 if (len < 0) {
2493 rb_raise(rb_eArgError, "negative argument");
2494 }
2495 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2496 if (STR_EMBEDDABLE_P(len, 1)) {
2497 str2 = str_alloc_embed(rb_cString, len + 1);
2498 memset(RSTRING_PTR(str2), 0, len + 1);
2499 }
2500 else {
2501 str2 = str_alloc_heap(rb_cString);
2502 RSTRING(str2)->as.heap.aux.capa = len;
2503 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2504 }
2505 STR_SET_LEN(str2, len);
2506 rb_enc_copy(str2, str);
2507 return str2;
2508 }
2509 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2510 rb_raise(rb_eArgError, "argument too big");
2511 }
2512
2513 len *= RSTRING_LEN(str);
2514 termlen = TERM_LEN(str);
2515 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2516 ptr2 = RSTRING_PTR(str2);
2517 if (len) {
2518 n = RSTRING_LEN(str);
2519 memcpy(ptr2, RSTRING_PTR(str), n);
2520 while (n <= len/2) {
2521 memcpy(ptr2 + n, ptr2, n);
2522 n *= 2;
2523 }
2524 memcpy(ptr2 + n, ptr2, len-n);
2525 }
2526 STR_SET_LEN(str2, len);
2527 TERM_FILL(&ptr2[len], termlen);
2528 rb_enc_cr_str_copy_for_substr(str2, str);
2529
2530 return str2;
2531}
2532
2533/*
2534 * call-seq:
2535 * string % object -> new_string
2536 *
2537 * Returns the result of formatting +object+ into the format specification +self+
2538 * (see Kernel#sprintf for formatting details):
2539 *
2540 * "%05d" % 123 # => "00123"
2541 *
2542 * If +self+ contains multiple substitutions, +object+ must be
2543 * an Array or Hash containing the values to be substituted:
2544 *
2545 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2546 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2547 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2548 *
2549 */
2550
2551static VALUE
2552rb_str_format_m(VALUE str, VALUE arg)
2553{
2554 VALUE tmp = rb_check_array_type(arg);
2555
2556 if (!NIL_P(tmp)) {
2557 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2558 }
2559 return rb_str_format(1, &arg, str);
2560}
2561
2562static inline void
2563rb_check_lockedtmp(VALUE str)
2564{
2565 if (FL_TEST(str, STR_TMPLOCK)) {
2566 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2567 }
2568}
2569
2570// If none of these flags are set, we know we have an modifiable string.
2571// If any is set, we need to do more detailed checks.
2572#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2573static inline void
2574str_modifiable(VALUE str)
2575{
2576 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2577 if (CHILLED_STRING_P(str)) {
2578 CHILLED_STRING_MUTATED(str);
2579 }
2580 rb_check_lockedtmp(str);
2581 rb_check_frozen(str);
2582 }
2583}
2584
2585static inline int
2586str_dependent_p(VALUE str)
2587{
2588 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2589 return FALSE;
2590 }
2591 else {
2592 return TRUE;
2593 }
2594}
2595
2596// If none of these flags are set, we know we have an independent string.
2597// If any is set, we need to do more detailed checks.
2598#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2599static inline int
2600str_independent(VALUE str)
2601{
2602 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2603 str_modifiable(str);
2604 return !str_dependent_p(str);
2605 }
2606 return TRUE;
2607}
2608
/*
 * Give +str+ a buffer of its own that holds its first +len+ bytes, with
 * capacity for +expand+ further bytes plus a +termlen+-byte terminator.
 * The string length is set to +len+ (clamped to the new capacity).
 */
static void
str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
{
    char *ptr;
    char *oldptr;
    long capa = len + expand;

    /* With a negative expand the capacity is below len: copy less. */
    if (len > capa) len = capa;

    /* A non-embedded string whose new contents fit the embed area is
     * converted to an embedded string.
     * NOTE(review): the old heap pointer is not freed on this path --
     * presumably it is only reached for shared/no-free buffers; confirm. */
    if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
        ptr = RSTRING(str)->as.heap.ptr;
        STR_SET_EMBED(str);
        memcpy(RSTRING(str)->as.embed.ary, ptr, len);
        TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
        STR_SET_LEN(str, len);
        return;
    }

    /* Otherwise allocate a fresh heap buffer and copy the contents over. */
    ptr = ALLOC_N(char, (size_t)capa + termlen);
    oldptr = RSTRING_PTR(str);
    if (oldptr) {
        memcpy(ptr, oldptr, len);
    }
    /* Free the old buffer only when NOEMBED is set and neither NOFREE
     * nor SHARED is. */
    if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
        xfree(oldptr);
    }
    STR_SET_NOEMBED(str);
    FL_UNSET(str, STR_SHARED|STR_NOFREE);
    TERM_FILL(ptr + len, termlen);
    RSTRING(str)->as.heap.ptr = ptr;
    STR_SET_LEN(str, len);
    RSTRING(str)->as.heap.aux.capa = capa;
}
2642
2643void
2644rb_str_modify(VALUE str)
2645{
2646 if (!str_independent(str))
2647 str_make_independent(str);
2649}
2650
2651void
2653{
2654 int termlen = TERM_LEN(str);
2655 long len = RSTRING_LEN(str);
2656
2657 if (expand < 0) {
2658 rb_raise(rb_eArgError, "negative expanding string size");
2659 }
2660 if (expand >= LONG_MAX - len) {
2661 rb_raise(rb_eArgError, "string size too big");
2662 }
2663
2664 if (!str_independent(str)) {
2665 str_make_independent_expand(str, len, expand, termlen);
2666 }
2667 else if (expand > 0) {
2668 RESIZE_CAPA_TERM(str, len + expand, termlen);
2669 }
2671}
2672
2673/* As rb_str_modify(), but don't clear coderange */
2674static void
2675str_modify_keep_cr(VALUE str)
2676{
2677 if (!str_independent(str))
2678 str_make_independent(str);
2680 /* Force re-scan later */
2682}
2683
2684static inline void
2685str_discard(VALUE str)
2686{
2687 str_modifiable(str);
2688 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2689 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2690 RSTRING(str)->as.heap.ptr = 0;
2691 STR_SET_LEN(str, 0);
2692 }
2693}
2694
2695void
2697{
2698 int encindex = rb_enc_get_index(str);
2699
2700 if (RB_UNLIKELY(encindex == -1)) {
2701 rb_raise(rb_eTypeError, "not encoding capable object");
2702 }
2703
2704 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2705 return;
2706 }
2707
2708 rb_encoding *enc = rb_enc_from_index(encindex);
2709 if (!rb_enc_asciicompat(enc)) {
2710 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2711 }
2712}
2713
2714VALUE
2716{
2717 VALUE s = *ptr;
2718 if (!RB_TYPE_P(s, T_STRING)) {
2719 s = rb_str_to_str(s);
2720 *ptr = s;
2721 }
2722 return s;
2723}
2724
2725char *
2727{
2728 VALUE str = rb_string_value(ptr);
2729 return RSTRING_PTR(str);
2730}
2731
/* Return 1 when the first +n+ bytes at +s+ are all zero (also for n <= 0),
 * 0 otherwise. */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2740
2741static const char *
2742str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2743{
2744 const char *e = s + len;
2745
2746 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2747 if (zero_filled(s, minlen)) return s;
2748 }
2749 return 0;
2750}
2751
2752static char *
2753str_fill_term(VALUE str, char *s, long len, int termlen)
2754{
2755 /* This function assumes that (capa + termlen) bytes of memory
2756 * is allocated, like many other functions in this file.
2757 */
2758 if (str_dependent_p(str)) {
2759 if (!zero_filled(s + len, termlen))
2760 str_make_independent_expand(str, len, 0L, termlen);
2761 }
2762 else {
2763 TERM_FILL(s + len, termlen);
2764 return s;
2765 }
2766 return RSTRING_PTR(str);
2767}
2768
2769void
2770rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2771{
2772 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2773 long len = RSTRING_LEN(str);
2774
2775 RUBY_ASSERT(capa >= len);
2776 if (capa - len < termlen) {
2777 rb_check_lockedtmp(str);
2778 str_make_independent_expand(str, len, 0L, termlen);
2779 }
2780 else if (str_dependent_p(str)) {
2781 if (termlen > oldtermlen)
2782 str_make_independent_expand(str, len, 0L, termlen);
2783 }
2784 else {
2785 if (!STR_EMBED_P(str)) {
2786 /* modify capa instead of realloc */
2787 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2788 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2789 }
2790 if (termlen > oldtermlen) {
2791 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2792 }
2793 }
2794
2795 return;
2796}
2797
2798static char *
2799str_null_check(VALUE str, int *w)
2800{
2801 char *s = RSTRING_PTR(str);
2802 long len = RSTRING_LEN(str);
2803 rb_encoding *enc = rb_enc_get(str);
2804 const int minlen = rb_enc_mbminlen(enc);
2805
2806 if (minlen > 1) {
2807 *w = 1;
2808 if (str_null_char(s, len, minlen, enc)) {
2809 return NULL;
2810 }
2811 return str_fill_term(str, s, len, minlen);
2812 }
2813 *w = 0;
2814 if (!s || memchr(s, 0, len)) {
2815 return NULL;
2816 }
2817 if (s[len]) {
2818 s = str_fill_term(str, s, len, minlen);
2819 }
2820 return s;
2821}
2822
2823char *
2824rb_str_to_cstr(VALUE str)
2825{
2826 int w;
2827 return str_null_check(str, &w);
2828}
2829
2830char *
2832{
2833 VALUE str = rb_string_value(ptr);
2834 int w;
2835 char *s = str_null_check(str, &w);
2836 if (!s) {
2837 if (w) {
2838 rb_raise(rb_eArgError, "string contains null char");
2839 }
2840 rb_raise(rb_eArgError, "string contains null byte");
2841 }
2842 return s;
2843}
2844
2845char *
2846rb_str_fill_terminator(VALUE str, const int newminlen)
2847{
2848 char *s = RSTRING_PTR(str);
2849 long len = RSTRING_LEN(str);
2850 return str_fill_term(str, s, len, newminlen);
2851}
2852
2853VALUE
2855{
2856 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2857 return str;
2858}
2859
2860/*
2861 * call-seq:
2862 * String.try_convert(object) -> object, new_string, or nil
2863 *
2864 * If +object+ is a +String+ object, returns +object+.
2865 *
2866 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2867 * calls <tt>object.to_str</tt> and returns the result.
2868 *
2869 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2870 *
2871 * Raises an exception unless <tt>object.to_str</tt> returns a +String+ object.
2872 */
2873static VALUE
2874rb_str_s_try_convert(VALUE dummy, VALUE str)
2875{
2876 return rb_check_string_type(str);
2877}
2878
/*
 * Advance from +p+ by *nthp characters of encoding +enc+, never returning
 * a pointer beyond +e+.  Returns the resulting position.
 *
 * On return *nthp is rewritten: in the variable-width branches it holds
 * the number of characters that could not be skipped; in the two
 * fixed-width branches it is written back unchanged.
 */
static char*
str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
{
    long nth = *nthp;
    if (rb_enc_mbmaxlen(enc) == 1) {
        /* Single-byte encoding: characters are bytes. */
        p += nth;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: plain multiplication. */
        p += nth * rb_enc_mbmaxlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        const char *p2, *e2;
        int n;

        while (p < e && 0 < nth) {
            /* e2: where we would land if every remaining char were 1 byte. */
            e2 = p + nth;
            if (e < e2) {
                *nthp = nth;
                return (char *)e;
            }
            if (ISASCII(*p)) {
                /* Skip a run of ASCII bytes in one step. */
                p2 = search_nonascii(p, e2);
                if (!p2) {
                    /* Everything up to e2 is ASCII. */
                    nth -= e2 - p;
                    *nthp = nth;
                    return (char *)e2;
                }
                nth -= p2 - p;
                p = p2;
            }
            /* Step over one character. */
            n = rb_enc_mbclen(p, e, enc);
            p += n;
            nth--;
        }
        *nthp = nth;
        if (nth != 0) {
            return (char *)e;
        }
        return (char *)p;
    }
    else {
        /* Generic variable-width encoding: step character by character. */
        while (p < e && nth--) {
            p += rb_enc_mbclen(p, e, enc);
        }
    }
    if (p > e) p = e;
    *nthp = nth;
    return (char*)p;
}
2928
2929char*
2930rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2931{
2932 return str_nth_len(p, e, &nth, enc);
2933}
2934
2935static char*
2936str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2937{
2938 if (singlebyte)
2939 p += nth;
2940 else {
2941 p = str_nth_len(p, e, &nth, enc);
2942 }
2943 if (!p) return 0;
2944 if (p > e) p = e;
2945 return (char *)p;
2946}
2947
2948/* char offset to byte offset */
2949static long
2950str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2951{
2952 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2953 if (!pp) return e - p;
2954 return pp - p;
2955}
2956
2957long
2958rb_str_offset(VALUE str, long pos)
2959{
2960 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2961 STR_ENC_GET(str), single_byte_optimizable(str));
2962}
2963
#ifdef NONASCII_MASK
/*
 * UTF-8 specific variant of "advance by *nthp characters": counts lead
 * bytes between +p+ and +e+.  Returns the position reached and stores
 * the count that is left in *nthp.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;
    /* Word-at-a-time counting, used only when both the input and the
     * requested count exceed two words. */
    if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
        const uintptr_t *s, *t;
        const uintptr_t lowbits = SIZEOF_VOIDP - 1;
        /* s: p rounded up, t: e rounded down, to a word boundary. */
        s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
        t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
        /* Bytes before the first aligned word are counted one by one. */
        while (p < (const char *)s) {
            if (is_utf8_lead_byte(*p)) nth--;
            p++;
        }
        /* Stop while at least a word's worth of characters is left, so
         * the byte loop below finishes the job. */
        do {
            nth -= count_utf8_lead_bytes_with_word(s);
            s++;
        } while (s < t && (int)SIZEOF_VOIDP <= nth);
        p = (char *)s;
    }
    /* Byte-wise tail: stop at the lead byte reached with nth == 0. */
    while (p < e) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
        p++;
    }
    *nthp = nth;
    return (char *)p;
}

/* Byte distance from +p+ to the position str_utf8_nth() reaches for +nth+. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *pp = str_utf8_nth(p, e, &nth);
    return pp - p;
}
#endif
3002
3003/* byte offset to char offset */
3004long
3005rb_str_sublen(VALUE str, long pos)
3006{
3007 if (single_byte_optimizable(str) || pos < 0)
3008 return pos;
3009 else {
3010 char *p = RSTRING_PTR(str);
3011 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3012 }
3013}
3014
/*
 * New String holding the +len+ bytes of +str+ starting at byte offset
 * +beg+.  The caller guarantees 0 <= beg, 0 <= len and
 * beg + len <= RSTRING_LEN(str) (asserted below).
 * Encoding/coderange are not copied here except as done by the helpers
 * called; see rb_str_subseq() for the public form.
 */
static VALUE
str_subseq(VALUE str, long beg, long len)
{
    VALUE str2;

    RUBY_ASSERT(beg >= 0);
    RUBY_ASSERT(len >= 0);
    RUBY_ASSERT(beg+len <= RSTRING_LEN(str));

    const int termlen = TERM_LEN(str);
    /* Ranges that do not qualify for sharing are copied. */
    if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
        str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
        RB_GC_GUARD(str);
        return str2;
    }

    str2 = str_alloc_heap(rb_cString);
    if (str_embed_capa(str2) >= len + termlen) {
        /* Small enough for the embed area: copy instead of sharing. */
        char *ptr2 = RSTRING(str2)->as.embed.ary;
        STR_SET_EMBED(str2);
        memcpy(ptr2, RSTRING_PTR(str) + beg, len);
        TERM_FILL(ptr2+len, termlen);

        STR_SET_LEN(str2, len);
        RB_GC_GUARD(str);
    }
    else {
        /* Share the buffer of str, then narrow the view to the range. */
        str_replace_shared(str2, str);
        RUBY_ASSERT(!STR_EMBED_P(str2));
        ENC_CODERANGE_CLEAR(str2);
        RSTRING(str2)->as.heap.ptr += beg;
        if (RSTRING_LEN(str2) > len) {
            STR_SET_LEN(str2, len);
        }
    }

    return str2;
}
3053
3054VALUE
3055rb_str_subseq(VALUE str, long beg, long len)
3056{
3057 VALUE str2 = str_subseq(str, beg, len);
3058 rb_enc_cr_str_copy_for_substr(str2, str);
3059 return str2;
3060}
3061
/*
 * Resolve the substring of +str+ that starts at index +beg+ (negative
 * values count from the end) and spans *lenp units.  Returns a pointer
 * into the buffer of +str+ and stores the length to take from there in
 * *lenp, or returns 0 when the request is out of range.
 */
char *
rb_str_subpos(VALUE str, long beg, long *lenp)
{
    long len = *lenp;
    long slen = -1L;
    const long blen = RSTRING_LEN(str);
    rb_encoding *enc = STR_ENC_GET(str);
    char *p, *s = RSTRING_PTR(str), *e = s + blen;

    if (len < 0) return 0;
    /* -beg is negative for a negative beg only when the negation
     * overflowed. */
    if (beg < 0 && -beg < 0) return 0;
    if (!blen) {
        len = 0;
    }
    if (single_byte_optimizable(str)) {
        /* Characters are bytes: plain index arithmetic. */
        if (beg > blen) return 0;
        if (beg < 0) {
            beg += blen;
            if (beg < 0) return 0;
        }
        if (len > blen - beg)
            len = blen - beg;
        if (len < 0) return 0;
        p = s + beg;
        goto end;
    }
    if (beg < 0) {
        if (len > -beg) len = -beg;
        /* Walk backwards from the end when the coderange is VALID and
         * the requested distance is small relative to the byte length. */
        if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
            (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
            beg = -beg;
            /* First move e back to the end of the wanted range ... */
            while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
            p = e;
            if (!p) return 0;
            /* ... then move p back len characters to its start. */
            while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
            if (!p) return 0;
            len = e - p;
            goto end;
        }
        else {
            /* Otherwise turn beg into a non-negative character index. */
            slen = str_strlen(str, enc);
            beg += slen;
            if (beg < 0) return 0;
            p = s + beg;
            if (len == 0) goto end;
        }
    }
    else if (beg > 0 && beg > blen) {
        return 0;
    }
    if (len == 0) {
        if (beg > str_strlen(str, enc)) return 0; /* str's enc */
        p = s + beg;
    }
#ifdef NONASCII_MASK
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
             enc == rb_utf8_encoding()) {
        /* UTF-8 with VALID coderange: use the lead-byte counters. */
        p = str_utf8_nth(s, e, &beg);
        if (beg > 0) return 0;
        len = str_utf8_offset(p, e, len);
    }
#endif
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: scale indexes by the character size. */
        int char_sz = rb_enc_mbmaxlen(enc);

        p = s + beg * char_sz;
        if (p > e) {
            return 0;
        }
        else if (len * char_sz > e - p)
            len = e - p;
        else
            len *= char_sz;
    }
    else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
        /* Start lies at (or beyond) the end of the string. */
        if (beg > 0) return 0;
        len = 0;
    }
    else {
        len = str_offset(p, e, len, enc, 0);
    }
  end:
    *lenp = len;
    RB_GC_GUARD(str);
    return p;
}
3148
3149static VALUE str_substr(VALUE str, long beg, long len, int empty);
3150
3151VALUE
3152rb_str_substr(VALUE str, long beg, long len)
3153{
3154 return str_substr(str, beg, len, TRUE);
3155}
3156
3157VALUE
3158rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3159{
3160 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3161}
3162
3163static VALUE
3164str_substr(VALUE str, long beg, long len, int empty)
3165{
3166 char *p = rb_str_subpos(str, beg, &len);
3167
3168 if (!p) return Qnil;
3169 if (!len && !empty) return Qnil;
3170
3171 beg = p - RSTRING_PTR(str);
3172
3173 VALUE str2 = str_subseq(str, beg, len);
3174 rb_enc_cr_str_copy_for_substr(str2, str);
3175 return str2;
3176}
3177
3178/* :nodoc: */
3179VALUE
3181{
3182 if (CHILLED_STRING_P(str)) {
3183 FL_UNSET_RAW(str, STR_CHILLED);
3184 }
3185
3186 if (OBJ_FROZEN(str)) return str;
3187 rb_str_resize(str, RSTRING_LEN(str));
3188 return rb_obj_freeze(str);
3189}
3190
3191/*
3192 * call-seq:
3193 * +string -> new_string or self
3194 *
3195 * Returns +self+ if +self+ is not frozen and can be mutated
3196 * without warning issuance.
3197 *
3198 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3199 */
3200static VALUE
3201str_uplus(VALUE str)
3202{
3203 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3204 return rb_str_dup(str);
3205 }
3206 else {
3207 return str;
3208 }
3209}
3210
3211/*
3212 * call-seq:
3213 * -string -> frozen_string
3214 * dedup -> frozen_string
3215 *
3216 * Returns a frozen, possibly pre-existing copy of the string.
3217 *
3218 * The returned +String+ will be deduplicated as long as it does not have
3219 * any instance variables set on it and is not a String subclass.
3220 *
3221 * Note that <tt>-string</tt> variant is more convenient for defining
3222 * constants:
3223 *
3224 * FILENAME = -'config/database.yml'
3225 *
 * while +dedup+ is better suited for using the method in chains
 * of calculations:
3228 *
3229 * @url_list.concat(urls.map(&:dedup))
3230 *
3231 */
3232static VALUE
3233str_uminus(VALUE str)
3234{
3235 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3236 str = rb_str_dup(str);
3237 }
3238 return rb_fstring(str);
3239}
3240
/* rb_str_dup_frozen is provided as an alias of rb_str_new_frozen
 * (NOTE(review): exact expansion of RUBY_ALIAS_FUNCTION is defined
 * elsewhere -- confirm).  The macro below maps any later use of the
 * name in this file directly to rb_str_new_frozen. */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
3243
3244VALUE
3246{
3247 if (FL_TEST(str, STR_TMPLOCK)) {
3248 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3249 }
3250 FL_SET(str, STR_TMPLOCK);
3251 return str;
3252}
3253
3254VALUE
3256{
3257 if (!FL_TEST(str, STR_TMPLOCK)) {
3258 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3259 }
3260 FL_UNSET(str, STR_TMPLOCK);
3261 return str;
3262}
3263
3264VALUE
3265rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3266{
3267 rb_str_locktmp(str);
3268 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3269}
3270
3271void
3273{
3274 long capa;
3275 const int termlen = TERM_LEN(str);
3276
3277 str_modifiable(str);
3278 if (STR_SHARED_P(str)) {
3279 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3280 }
3281 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3282 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3283 }
3284
3285 int cr = ENC_CODERANGE(str);
3286 if (len == 0) {
3287 /* Empty string does not contain non-ASCII */
3289 }
3290 else if (cr == ENC_CODERANGE_UNKNOWN) {
3291 /* Leave unknown. */
3292 }
3293 else if (len > RSTRING_LEN(str)) {
3294 if (ENC_CODERANGE_CLEAN_P(cr)) {
3295 /* Update the coderange regarding the extended part. */
3296 const char *const prev_end = RSTRING_END(str);
3297 const char *const new_end = RSTRING_PTR(str) + len;
3298 rb_encoding *enc = rb_enc_get(str);
3299 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3300 ENC_CODERANGE_SET(str, cr);
3301 }
3302 else if (cr == ENC_CODERANGE_BROKEN) {
3303 /* May be valid now, by appended part. */
3305 }
3306 }
3307 else if (len < RSTRING_LEN(str)) {
3308 if (cr != ENC_CODERANGE_7BIT) {
3309 /* ASCII-only string is keeping after truncated. Valid
3310 * and broken may be invalid or valid, leave unknown. */
3312 }
3313 }
3314
3315 STR_SET_LEN(str, len);
3316 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3317}
3318
3319VALUE
3320rb_str_resize(VALUE str, long len)
3321{
3322 if (len < 0) {
3323 rb_raise(rb_eArgError, "negative string size (or size too big)");
3324 }
3325
3326 int independent = str_independent(str);
3327 long slen = RSTRING_LEN(str);
3328 const int termlen = TERM_LEN(str);
3329
3330 if (slen > len || (termlen != 1 && slen < len)) {
3332 }
3333
3334 {
3335 long capa;
3336 if (STR_EMBED_P(str)) {
3337 if (len == slen) return str;
3338 if (str_embed_capa(str) >= len + termlen) {
3339 STR_SET_LEN(str, len);
3340 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3341 return str;
3342 }
3343 str_make_independent_expand(str, slen, len - slen, termlen);
3344 }
3345 else if (str_embed_capa(str) >= len + termlen) {
3346 char *ptr = STR_HEAP_PTR(str);
3347 STR_SET_EMBED(str);
3348 if (slen > len) slen = len;
3349 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3350 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3351 STR_SET_LEN(str, len);
3352 if (independent) ruby_xfree(ptr);
3353 return str;
3354 }
3355 else if (!independent) {
3356 if (len == slen) return str;
3357 str_make_independent_expand(str, slen, len - slen, termlen);
3358 }
3359 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3360 (capa - len) > (len < 1024 ? len : 1024)) {
3361 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3362 (size_t)len + termlen, STR_HEAP_SIZE(str));
3363 RSTRING(str)->as.heap.aux.capa = len;
3364 }
3365 else if (len == slen) return str;
3366 STR_SET_LEN(str, len);
3367 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3368 }
3369 return str;
3370}
3371
3372static void
3373str_ensure_available_capa(VALUE str, long len)
3374{
3375 str_modify_keep_cr(str);
3376
3377 const int termlen = TERM_LEN(str);
3378 long olen = RSTRING_LEN(str);
3379
3380 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3381 rb_raise(rb_eArgError, "string sizes too big");
3382 }
3383
3384 long total = olen + len;
3385 long capa = str_capacity(str, termlen);
3386
3387 if (capa < total) {
3388 if (total >= LONG_MAX / 2) {
3389 capa = total;
3390 }
3391 while (total > capa) {
3392 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3393 }
3394 RESIZE_CAPA_TERM(str, capa, termlen);
3395 }
3396}
3397
3398static VALUE
3399str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3400{
3401 if (keep_cr) {
3402 str_modify_keep_cr(str);
3403 }
3404 else {
3405 rb_str_modify(str);
3406 }
3407 if (len == 0) return 0;
3408
3409 long total, olen, off = -1;
3410 char *sptr;
3411 const int termlen = TERM_LEN(str);
3412
3413 RSTRING_GETMEM(str, sptr, olen);
3414 if (ptr >= sptr && ptr <= sptr + olen) {
3415 off = ptr - sptr;
3416 }
3417
3418 long capa = str_capacity(str, termlen);
3419
3420 if (olen > LONG_MAX - len) {
3421 rb_raise(rb_eArgError, "string sizes too big");
3422 }
3423 total = olen + len;
3424 if (capa < total) {
3425 if (total >= LONG_MAX / 2) {
3426 capa = total;
3427 }
3428 while (total > capa) {
3429 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3430 }
3431 RESIZE_CAPA_TERM(str, capa, termlen);
3432 sptr = RSTRING_PTR(str);
3433 }
3434 if (off != -1) {
3435 ptr = sptr + off;
3436 }
3437 memcpy(sptr + olen, ptr, len);
3438 STR_SET_LEN(str, total);
3439 TERM_FILL(sptr + total, termlen); /* sentinel */
3440
3441 return str;
3442}
3443
/* Shorthands for str_buf_cat4() with keep_cr == false.  str_buf_cat2()
 * takes its length from rb_strlen_lit(ptr). */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3446
3447VALUE
3448rb_str_cat(VALUE str, const char *ptr, long len)
3449{
3450 if (len == 0) return str;
3451 if (len < 0) {
3452 rb_raise(rb_eArgError, "negative string size (or size too big)");
3453 }
3454 return str_buf_cat(str, ptr, len);
3455}
3456
3457VALUE
3458rb_str_cat_cstr(VALUE str, const char *ptr)
3459{
3460 must_not_null(ptr);
3461 return rb_str_buf_cat(str, ptr, strlen(ptr));
3462}
3463
3464static void
3465rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3466{
3467 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3468
3469 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3470 if (UNLIKELY(!str_independent(str))) {
3471 str_make_independent(str);
3472 }
3473
3474 long string_length = -1;
3475 const int null_terminator_length = 1;
3476 char *sptr;
3477 RSTRING_GETMEM(str, sptr, string_length);
3478
3479 // Ensure the resulting string wouldn't be too long.
3480 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3481 rb_raise(rb_eArgError, "string sizes too big");
3482 }
3483
3484 long string_capacity = str_capacity(str, null_terminator_length);
3485
3486 // Get the code range before any modifications since those might clear the code range.
3487 int cr = ENC_CODERANGE(str);
3488
3489 // Check if the string has spare string_capacity to write the new byte.
3490 if (LIKELY(string_capacity >= string_length + 1)) {
3491 // In fast path we can write the new byte and note the string's new length.
3492 sptr[string_length] = byte;
3493 STR_SET_LEN(str, string_length + 1);
3494 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3495 }
3496 else {
3497 // If there's not enough string_capacity, make a call into the general string concatenation function.
3498 str_buf_cat(str, (char *)&byte, 1);
3499 }
3500
3501 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3502 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3503 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3504 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3505 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3506 if (ISASCII(byte)) {
3508 }
3509 else {
3511
3512 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3513 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3514 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3515 }
3516 }
3517 }
3518}
3519
/* Alternative names kept for the appending functions: rb_str_buf_cat is
 * an alias of rb_str_cat; rb_str_buf_cat2 and rb_str_cat2 are aliases of
 * rb_str_cat_cstr. */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3523
/*
 * Append +len+ bytes at +ptr+, which are in the encoding +ptr_encindex+
 * with coderange +ptr_cr+ (may be UNKNOWN), to +str+, choosing the
 * encoding and coderange of the result.  When +ptr_cr_ret+ is non-NULL
 * it receives the (possibly newly scanned) coderange of the appended
 * bytes.  Raises Encoding::CompatibilityError for incompatible
 * encodings and ArgumentError for a negative +len+.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;
    rb_encoding *str_enc, *ptr_enc;

    /* An empty receiver is treated as 7bit. */
    str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;

    if (str_encindex == ptr_encindex) {
        /* Same encoding: scan the new bytes only when the receiver's
         * coderange is known (otherwise the result stays unknown). */
        if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
        str_enc = rb_enc_from_index(str_encindex);
        ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* With an ASCII-incompatible encoding involved, only the
             * trivial cases (nothing to append, or empty receiver) work. */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                /* The receiver takes over the encoding of the bytes. */
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
                return str;
            }
            goto incompatible;
        }
        if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, ptr_enc);
        }
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;

    /* Different encodings are compatible only if one side is 7bit. */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
        str_enc = rb_enc_from_index(str_encindex);
        ptr_enc = rb_enc_from_index(ptr_encindex);
        goto incompatible;
    }

    /* Pick encoding and coderange of the result. */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            res_encindex = str_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            /* 7bit receiver + non-7bit bytes: the bytes decide. */
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
        if (ENC_CODERANGE_CLEAN_P(ptr_cr))
            res_cr = str_cr;
        else
            res_cr = ptr_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        /* Appending anything resets the coderange to unknown. */
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    str_buf_cat(str, ptr, len);
    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;

  incompatible:
    rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
             rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
}
3613
3614VALUE
3615rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3616{
3617 return rb_enc_cr_str_buf_cat(str, ptr, len,
3618 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3619}
3620
3621VALUE
3622rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3623{
3624 /* ptr must reference NUL terminated ASCII string. */
3625 int encindex = ENCODING_GET(str);
3626 rb_encoding *enc = rb_enc_from_index(encindex);
3627 if (rb_enc_asciicompat(enc)) {
3628 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3629 encindex, ENC_CODERANGE_7BIT, 0);
3630 }
3631 else {
3632 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3633 while (*ptr) {
3634 unsigned int c = (unsigned char)*ptr;
3635 int len = rb_enc_codelen(c, enc);
3636 rb_enc_mbcput(c, buf, enc);
3637 rb_enc_cr_str_buf_cat(str, buf, len,
3638 encindex, ENC_CODERANGE_VALID, 0);
3639 ptr++;
3640 }
3641 return str;
3642 }
3643}
3644
3645VALUE
3647{
3648 int str2_cr = rb_enc_str_coderange(str2);
3649
3650 if (str_enc_fastpath(str)) {
3651 switch (str2_cr) {
3652 case ENC_CODERANGE_7BIT:
3653 // If RHS is 7bit we can do simple concatenation
3654 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3655 RB_GC_GUARD(str2);
3656 return str;
3658 // If RHS is valid, we can do simple concatenation if encodings are the same
3659 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3660 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3661 int str_cr = ENC_CODERANGE(str);
3662 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3663 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3664 }
3665 RB_GC_GUARD(str2);
3666 return str;
3667 }
3668 }
3669 }
3670
3671 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3672 ENCODING_GET(str2), str2_cr, &str2_cr);
3673
3674 ENC_CODERANGE_SET(str2, str2_cr);
3675
3676 return str;
3677}
3678
3679VALUE
3681{
3682 StringValue(str2);
3683 return rb_str_buf_append(str, str2);
3684}
3685
3686VALUE
3687rb_str_concat_literals(size_t num, const VALUE *strary)
3688{
3689 VALUE str;
3690 size_t i, s = 0;
3691 unsigned long len = 1;
3692
3693 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3694 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3695
3696 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3697 str = rb_str_buf_new(len);
3698 str_enc_copy_direct(str, strary[0]);
3699
3700 for (i = s; i < num; ++i) {
3701 const VALUE v = strary[i];
3702 int encidx = ENCODING_GET(v);
3703
3704 rb_str_buf_append(str, v);
3705 if (encidx != ENCINDEX_US_ASCII) {
3706 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3707 rb_enc_set_index(str, encidx);
3708 }
3709 }
3710 return str;
3711}
3712
3713/*
3714 * call-seq:
3715 * concat(*objects) -> string
3716 *
3717 * Concatenates each object in +objects+ to +self+ and returns +self+:
3718 *
3719 * s = 'foo'
3720 * s.concat('bar', 'baz') # => "foobarbaz"
3721 * s # => "foobarbaz"
3722 *
3723 * For each given object +object+ that is an Integer,
3724 * the value is considered a codepoint and converted to a character before concatenation:
3725 *
3726 * s = 'foo'
3727 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3728 *
3729 * Related: String#<<, which takes a single argument.
3730 */
3731static VALUE
3732rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3733{
3734 str_modifiable(str);
3735
3736 if (argc == 1) {
3737 return rb_str_concat(str, argv[0]);
3738 }
3739 else if (argc > 1) {
3740 int i;
3741 VALUE arg_str = rb_str_tmp_new(0);
3742 rb_enc_copy(arg_str, str);
3743 for (i = 0; i < argc; i++) {
3744 rb_str_concat(arg_str, argv[i]);
3745 }
3746 rb_str_buf_append(str, arg_str);
3747 }
3748
3749 return str;
3750}
3751
3752/*
3753 * call-seq:
3754 * append_as_bytes(*objects) -> string
3755 *
3756 * Concatenates each object in +objects+ into +self+ without any encoding
3757 * validation or conversion and returns +self+:
3758 *
3759 * s = 'foo'
3760 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3761 * s.valid_encoding? # => false
3762 * s.append_as_bytes("\xAC 12")
3763 * s.valid_encoding? # => true
3764 *
3765 * For each given object +object+ that is an Integer,
3766 * the value is considered a Byte. If the Integer is bigger
3767 * than one byte, only the lower byte is considered, similar to String#setbyte:
3768 *
3769 * s = ""
3770 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3771 *
3772 * Related: String#<<, String#concat, which do an encoding aware concatenation.
3773 */
3774
3775VALUE
3776rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3777{
3778 long needed_capacity = 0;
3779 volatile VALUE t0;
3780 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3781
3782 for (int index = 0; index < argc; index++) {
3783 VALUE obj = argv[index];
3784 enum ruby_value_type type = types[index] = rb_type(obj);
3785 switch (type) {
3786 case T_FIXNUM:
3787 case T_BIGNUM:
3788 needed_capacity++;
3789 break;
3790 case T_STRING:
3791 needed_capacity += RSTRING_LEN(obj);
3792 break;
3793 default:
3794 rb_raise(
3796 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3797 rb_obj_class(obj)
3798 );
3799 break;
3800 }
3801 }
3802
3803 str_ensure_available_capa(str, needed_capacity);
3804 char *sptr = RSTRING_END(str);
3805
3806 for (int index = 0; index < argc; index++) {
3807 VALUE obj = argv[index];
3808 enum ruby_value_type type = types[index];
3809 switch (type) {
3810 case T_FIXNUM:
3811 case T_BIGNUM: {
3812 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3813 char byte = (char)(NUM2INT(obj) & 0xFF);
3814 *sptr = byte;
3815 sptr++;
3816 break;
3817 }
3818 case T_STRING: {
3819 const char *ptr;
3820 long len;
3821 RSTRING_GETMEM(obj, ptr, len);
3822 memcpy(sptr, ptr, len);
3823 sptr += len;
3824 break;
3825 }
3826 default:
3827 rb_bug("append_as_bytes arguments should have been validated");
3828 }
3829 }
3830
3831 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3832 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3833
3834 int cr = ENC_CODERANGE(str);
3835 switch (cr) {
3836 case ENC_CODERANGE_7BIT: {
3837 for (int index = 0; index < argc; index++) {
3838 VALUE obj = argv[index];
3839 enum ruby_value_type type = types[index];
3840 switch (type) {
3841 case T_FIXNUM:
3842 case T_BIGNUM: {
3843 if (!ISASCII(NUM2INT(obj))) {
3844 goto clear_cr;
3845 }
3846 break;
3847 }
3848 case T_STRING: {
3849 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3850 goto clear_cr;
3851 }
3852 break;
3853 }
3854 default:
3855 rb_bug("append_as_bytes arguments should have been validated");
3856 }
3857 }
3858 break;
3859 }
3861 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3862 goto keep_cr;
3863 }
3864 else {
3865 goto clear_cr;
3866 }
3867 break;
3868 default:
3869 goto clear_cr;
3870 break;
3871 }
3872
3873 RB_GC_GUARD(t0);
3874
3875 clear_cr:
3876 // If no fast path was hit, we clear the coderange.
3877 // append_as_bytes is predominently meant to be used in
3878 // buffering situation, hence it's likely the coderange
3879 // will never be scanned, so it's not worth spending time
3880 // precomputing the coderange except for simple and common
3881 // situations.
3883 keep_cr:
3884 return str;
3885}
3886
3887/*
3888 * call-seq:
3889 * string << object -> string
3890 *
3891 * Concatenates +object+ to +self+ and returns +self+:
3892 *
3893 * s = 'foo'
3894 * s << 'bar' # => "foobar"
3895 * s # => "foobar"
3896 *
3897 * If +object+ is an Integer,
3898 * the value is considered a codepoint and converted to a character before concatenation:
3899 *
3900 * s = 'foo'
3901 * s << 33 # => "foo!"
3902 *
3903 * If that codepoint is not representable in the encoding of
3904 * _string_, RangeError is raised.
3905 *
3906 * s = 'foo'
3907 * s.encoding # => <Encoding:UTF-8>
3908 * s << 0x00110000 # 1114112 out of char range (RangeError)
3909 * s = 'foo'.encode('EUC-JP')
3910 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3911 *
3912 * If the encoding is US-ASCII and the codepoint is 0..0xff, _string_
3913 * is automatically promoted to ASCII-8BIT.
3914 *
3915 * s = 'foo'.encode('US-ASCII')
3916 * s << 0xff
3917 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3918 *
3919 * Related: String#concat, which takes multiple arguments.
3920 */
3921VALUE
3923{
3924 unsigned int code;
3925 rb_encoding *enc = STR_ENC_GET(str1);
3926 int encidx;
3927
3928 if (RB_INTEGER_TYPE_P(str2)) {
3929 if (rb_num_to_uint(str2, &code) == 0) {
3930 }
3931 else if (FIXNUM_P(str2)) {
3932 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3933 }
3934 else {
3935 rb_raise(rb_eRangeError, "bignum out of char range");
3936 }
3937 }
3938 else {
3939 return rb_str_append(str1, str2);
3940 }
3941
3942 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3943
3944 if (encidx >= 0) {
3945 rb_str_buf_cat_byte(str1, (unsigned char)code);
3946 }
3947 else {
3948 long pos = RSTRING_LEN(str1);
3949 int cr = ENC_CODERANGE(str1);
3950 int len;
3951 char *buf;
3952
3953 switch (len = rb_enc_codelen(code, enc)) {
3954 case ONIGERR_INVALID_CODE_POINT_VALUE:
3955 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3956 break;
3957 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3958 case 0:
3959 rb_raise(rb_eRangeError, "%u out of char range", code);
3960 break;
3961 }
3962 buf = ALLOCA_N(char, len + 1);
3963 rb_enc_mbcput(code, buf, enc);
3964 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3965 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3966 }
3967 rb_str_resize(str1, pos+len);
3968 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3969 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3971 }
3972 else if (cr == ENC_CODERANGE_BROKEN) {
3974 }
3975 ENC_CODERANGE_SET(str1, cr);
3976 }
3977 return str1;
3978}
3979
3980int
3981rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3982{
3983 int encidx = rb_enc_to_index(enc);
3984
3985 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3986 /* US-ASCII automatically extended to ASCII-8BIT */
3987 if (code > 0xFF) {
3988 rb_raise(rb_eRangeError, "%u out of char range", code);
3989 }
3990 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3991 return ENCINDEX_ASCII_8BIT;
3992 }
3993 return encidx;
3994 }
3995 else {
3996 return -1;
3997 }
3998}
3999
4000/*
4001 * call-seq:
4002 * prepend(*other_strings) -> string
4003 *
4004 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4005 *
4006 * s = 'foo'
4007 * s.prepend('bar', 'baz') # => "barbazfoo"
4008 * s # => "barbazfoo"
4009 *
4010 * Related: String#concat.
4011 */
4012
4013static VALUE
4014rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4015{
4016 str_modifiable(str);
4017
4018 if (argc == 1) {
4019 rb_str_update(str, 0L, 0L, argv[0]);
4020 }
4021 else if (argc > 1) {
4022 int i;
4023 VALUE arg_str = rb_str_tmp_new(0);
4024 rb_enc_copy(arg_str, str);
4025 for (i = 0; i < argc; i++) {
4026 rb_str_append(arg_str, argv[i]);
4027 }
4028 rb_str_update(str, 0L, 0L, arg_str);
4029 }
4030
4031 return str;
4032}
4033
4034st_index_t
4036{
4037 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4038 st_index_t precomputed_hash;
4039 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4040
4041 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4042 return precomputed_hash;
4043 }
4044
4045 return str_do_hash(str);
4046}
4047
4048int
4050{
4051 long len1, len2;
4052 const char *ptr1, *ptr2;
4053 RSTRING_GETMEM(str1, ptr1, len1);
4054 RSTRING_GETMEM(str2, ptr2, len2);
4055 return (len1 != len2 ||
4056 !rb_str_comparable(str1, str2) ||
4057 memcmp(ptr1, ptr2, len1) != 0);
4058}
4059
4060/*
4061 * call-seq:
4062 * hash -> integer
4063 *
4064 * Returns the integer hash value for +self+.
4065 * The value is based on the length, content and encoding of +self+.
4066 *
4067 * Related: Object#hash.
4068 */
4069
4070static VALUE
4071rb_str_hash_m(VALUE str)
4072{
4073 st_index_t hval = rb_str_hash(str);
4074 return ST2FIX(hval);
4075}
4076
/* Yields the smaller of a and b (a when they compare equal).
 * Each argument may be evaluated twice, so avoid side effects. */
#define lesser(a,b) (((b)<(a))?(b):(a))
4078
4079int
4081{
4082 int idx1, idx2;
4083 int rc1, rc2;
4084
4085 if (RSTRING_LEN(str1) == 0) return TRUE;
4086 if (RSTRING_LEN(str2) == 0) return TRUE;
4087 idx1 = ENCODING_GET(str1);
4088 idx2 = ENCODING_GET(str2);
4089 if (idx1 == idx2) return TRUE;
4090 rc1 = rb_enc_str_coderange(str1);
4091 rc2 = rb_enc_str_coderange(str2);
4092 if (rc1 == ENC_CODERANGE_7BIT) {
4093 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4094 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4095 return TRUE;
4096 }
4097 if (rc2 == ENC_CODERANGE_7BIT) {
4098 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4099 return TRUE;
4100 }
4101 return FALSE;
4102}
4103
4104int
4106{
4107 long len1, len2;
4108 const char *ptr1, *ptr2;
4109 int retval;
4110
4111 if (str1 == str2) return 0;
4112 RSTRING_GETMEM(str1, ptr1, len1);
4113 RSTRING_GETMEM(str2, ptr2, len2);
4114 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4115 if (len1 == len2) {
4116 if (!rb_str_comparable(str1, str2)) {
4117 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4118 return 1;
4119 return -1;
4120 }
4121 return 0;
4122 }
4123 if (len1 > len2) return 1;
4124 return -1;
4125 }
4126 if (retval > 0) return 1;
4127 return -1;
4128}
4129
4130/*
4131 * call-seq:
4132 * string == object -> true or false
4133 * string === object -> true or false
4134 *
4135 * Returns +true+ if +object+ has the same length and content
4136 * as +self+; +false+ otherwise:
4137 *
4138 * s = 'foo'
4139 * s == 'foo' # => true
4140 * s == 'food' # => false
4141 * s == 'FOO' # => false
4142 *
4143 * Returns +false+ if the two strings' encodings are not compatible:
4144 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
4145 *
4146 * If +object+ is not an instance of +String+ but responds to +to_str+, then the
4147 * two strings are compared using <code>object.==</code>.
4148 */
4149
4150VALUE
4152{
4153 if (str1 == str2) return Qtrue;
4154 if (!RB_TYPE_P(str2, T_STRING)) {
4155 if (!rb_respond_to(str2, idTo_str)) {
4156 return Qfalse;
4157 }
4158 return rb_equal(str2, str1);
4159 }
4160 return rb_str_eql_internal(str1, str2);
4161}
4162
4163/*
4164 * call-seq:
4165 * eql?(object) -> true or false
4166 *
4167 * Returns +true+ if +object+ has the same length and content
4168 * as +self+; +false+ otherwise:
4169 *
4170 * s = 'foo'
4171 * s.eql?('foo') # => true
4172 * s.eql?('food') # => false
4173 * s.eql?('FOO') # => false
4174 *
4175 * Returns +false+ if the two strings' encodings are not compatible:
4176 *
4177 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
4178 *
4179 */
4180
4181VALUE
4182rb_str_eql(VALUE str1, VALUE str2)
4183{
4184 if (str1 == str2) return Qtrue;
4185 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4186 return rb_str_eql_internal(str1, str2);
4187}
4188
4189/*
4190 * call-seq:
4191 * string <=> other_string -> -1, 0, 1, or nil
4192 *
4193 * Compares +self+ and +other_string+, returning:
4194 *
4195 * - -1 if +other_string+ is larger.
4196 * - 0 if the two are equal.
4197 * - 1 if +other_string+ is smaller.
4198 * - +nil+ if the two are incomparable.
4199 *
4200 * Examples:
4201 *
4202 * 'foo' <=> 'foo' # => 0
4203 * 'foo' <=> 'food' # => -1
4204 * 'food' <=> 'foo' # => 1
4205 * 'FOO' <=> 'foo' # => -1
4206 * 'foo' <=> 'FOO' # => 1
4207 * 'foo' <=> 1 # => nil
4208 *
4209 */
4210
4211static VALUE
4212rb_str_cmp_m(VALUE str1, VALUE str2)
4213{
4214 int result;
4215 VALUE s = rb_check_string_type(str2);
4216 if (NIL_P(s)) {
4217 return rb_invcmp(str1, str2);
4218 }
4219 result = rb_str_cmp(str1, s);
4220 return INT2FIX(result);
4221}
4222
4223static VALUE str_casecmp(VALUE str1, VALUE str2);
4224static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4225
4226/*
4227 * call-seq:
4228 * casecmp(other_string) -> -1, 0, 1, or nil
4229 *
4230 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4231 *
4232 * - -1 if <tt>other_string.downcase</tt> is larger.
4233 * - 0 if the two are equal.
4234 * - 1 if <tt>other_string.downcase</tt> is smaller.
4235 * - +nil+ if the two are incomparable.
4236 *
4237 * Examples:
4238 *
4239 * 'foo'.casecmp('foo') # => 0
4240 * 'foo'.casecmp('food') # => -1
4241 * 'food'.casecmp('foo') # => 1
4242 * 'FOO'.casecmp('foo') # => 0
4243 * 'foo'.casecmp('FOO') # => 0
4244 * 'foo'.casecmp(1) # => nil
4245 *
4246 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4247 *
4248 * Related: String#casecmp?.
4249 *
4250 */
4251
4252static VALUE
4253rb_str_casecmp(VALUE str1, VALUE str2)
4254{
4255 VALUE s = rb_check_string_type(str2);
4256 if (NIL_P(s)) {
4257 return Qnil;
4258 }
4259 return str_casecmp(str1, s);
4260}
4261
4262static VALUE
4263str_casecmp(VALUE str1, VALUE str2)
4264{
4265 long len;
4266 rb_encoding *enc;
4267 const char *p1, *p1end, *p2, *p2end;
4268
4269 enc = rb_enc_compatible(str1, str2);
4270 if (!enc) {
4271 return Qnil;
4272 }
4273
4274 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4275 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4276 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4277 while (p1 < p1end && p2 < p2end) {
4278 if (*p1 != *p2) {
4279 unsigned int c1 = TOLOWER(*p1 & 0xff);
4280 unsigned int c2 = TOLOWER(*p2 & 0xff);
4281 if (c1 != c2)
4282 return INT2FIX(c1 < c2 ? -1 : 1);
4283 }
4284 p1++;
4285 p2++;
4286 }
4287 }
4288 else {
4289 while (p1 < p1end && p2 < p2end) {
4290 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4291 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4292
4293 if (0 <= c1 && 0 <= c2) {
4294 c1 = TOLOWER(c1);
4295 c2 = TOLOWER(c2);
4296 if (c1 != c2)
4297 return INT2FIX(c1 < c2 ? -1 : 1);
4298 }
4299 else {
4300 int r;
4301 l1 = rb_enc_mbclen(p1, p1end, enc);
4302 l2 = rb_enc_mbclen(p2, p2end, enc);
4303 len = l1 < l2 ? l1 : l2;
4304 r = memcmp(p1, p2, len);
4305 if (r != 0)
4306 return INT2FIX(r < 0 ? -1 : 1);
4307 if (l1 != l2)
4308 return INT2FIX(l1 < l2 ? -1 : 1);
4309 }
4310 p1 += l1;
4311 p2 += l2;
4312 }
4313 }
4314 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4315 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4316 return INT2FIX(-1);
4317}
4318
4319/*
4320 * call-seq:
4321 * casecmp?(other_string) -> true, false, or nil
4322 *
4323 * Returns +true+ if +self+ and +other_string+ are equal after
4324 * Unicode case folding, otherwise +false+:
4325 *
4326 * 'foo'.casecmp?('foo') # => true
4327 * 'foo'.casecmp?('food') # => false
4328 * 'food'.casecmp?('foo') # => false
4329 * 'FOO'.casecmp?('foo') # => true
4330 * 'foo'.casecmp?('FOO') # => true
4331 *
4332 * Returns +nil+ if the two values are incomparable:
4333 *
4334 * 'foo'.casecmp?(1) # => nil
4335 *
4336 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4337 *
4338 * Related: String#casecmp.
4339 *
4340 */
4341
4342static VALUE
4343rb_str_casecmp_p(VALUE str1, VALUE str2)
4344{
4345 VALUE s = rb_check_string_type(str2);
4346 if (NIL_P(s)) {
4347 return Qnil;
4348 }
4349 return str_casecmp_p(str1, s);
4350}
4351
4352static VALUE
4353str_casecmp_p(VALUE str1, VALUE str2)
4354{
4355 rb_encoding *enc;
4356 VALUE folded_str1, folded_str2;
4357 VALUE fold_opt = sym_fold;
4358
4359 enc = rb_enc_compatible(str1, str2);
4360 if (!enc) {
4361 return Qnil;
4362 }
4363
4364 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4365 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4366
4367 return rb_str_eql(folded_str1, folded_str2);
4368}
4369
4370static long
4371strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4372 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4373{
4374 const char *search_start = str_ptr;
4375 long pos, search_len = str_len - offset;
4376
4377 for (;;) {
4378 const char *t;
4379 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4380 if (pos < 0) return pos;
4381 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4382 if (t == search_start + pos) break;
4383 search_len -= t - search_start;
4384 if (search_len <= 0) return -1;
4385 offset += t - search_start;
4386 search_start = t;
4387 }
4388 return pos + offset;
4389}
4390
4391/* found index in byte */
4392#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4393#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4394
4395static long
4396rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4397{
4398 const char *str_ptr, *str_ptr_end, *sub_ptr;
4399 long str_len, sub_len;
4400 rb_encoding *enc;
4401
4402 enc = rb_enc_check(str, sub);
4403 if (is_broken_string(sub)) return -1;
4404
4405 str_ptr = RSTRING_PTR(str);
4406 str_ptr_end = RSTRING_END(str);
4407 str_len = RSTRING_LEN(str);
4408 sub_ptr = RSTRING_PTR(sub);
4409 sub_len = RSTRING_LEN(sub);
4410
4411 if (str_len < sub_len) return -1;
4412
4413 if (offset != 0) {
4414 long str_len_char, sub_len_char;
4415 int single_byte = single_byte_optimizable(str);
4416 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4417 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4418 if (offset < 0) {
4419 offset += str_len_char;
4420 if (offset < 0) return -1;
4421 }
4422 if (str_len_char - offset < sub_len_char) return -1;
4423 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4424 str_ptr += offset;
4425 }
4426 if (sub_len == 0) return offset;
4427
4428 /* need proceed one character at a time */
4429 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4430}
4431
4432
4433/*
4434 * call-seq:
4435 * index(substring, offset = 0) -> integer or nil
4436 * index(regexp, offset = 0) -> integer or nil
4437 *
4438 * :include: doc/string/index.rdoc
4439 *
4440 */
4441
4442static VALUE
4443rb_str_index_m(int argc, VALUE *argv, VALUE str)
4444{
4445 VALUE sub;
4446 VALUE initpos;
4447 rb_encoding *enc = STR_ENC_GET(str);
4448 long pos;
4449
4450 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4451 long slen = str_strlen(str, enc); /* str's enc */
4452 pos = NUM2LONG(initpos);
4453 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4454 if (RB_TYPE_P(sub, T_REGEXP)) {
4456 }
4457 return Qnil;
4458 }
4459 }
4460 else {
4461 pos = 0;
4462 }
4463
4464 if (RB_TYPE_P(sub, T_REGEXP)) {
4465 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4466 enc, single_byte_optimizable(str));
4467
4468 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4469 VALUE match = rb_backref_get();
4470 struct re_registers *regs = RMATCH_REGS(match);
4471 pos = rb_str_sublen(str, BEG(0));
4472 return LONG2NUM(pos);
4473 }
4474 }
4475 else {
4476 StringValue(sub);
4477 pos = rb_str_index(str, sub, pos);
4478 if (pos >= 0) {
4479 pos = rb_str_sublen(str, pos);
4480 return LONG2NUM(pos);
4481 }
4482 }
4483 return Qnil;
4484}
4485
4486/* Ensure that the given pos is a valid character boundary.
4487 * Note that in this function, "character" means a code point
4488 * (Unicode scalar value), not a grapheme cluster.
4489 */
4490static void
4491str_ensure_byte_pos(VALUE str, long pos)
4492{
4493 if (!single_byte_optimizable(str)) {
4494 const char *s = RSTRING_PTR(str);
4495 const char *e = RSTRING_END(str);
4496 const char *p = s + pos;
4497 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4498 rb_raise(rb_eIndexError,
4499 "offset %ld does not land on character boundary", pos);
4500 }
4501 }
4502}
4503
4504/*
4505 * call-seq:
4506 * byteindex(substring, offset = 0) -> integer or nil
4507 * byteindex(regexp, offset = 0) -> integer or nil
4508 *
4509 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4510 * or +nil+ if none found:
4511 *
4512 * 'foo'.byteindex('f') # => 0
4513 * 'foo'.byteindex('o') # => 1
4514 * 'foo'.byteindex('oo') # => 1
4515 * 'foo'.byteindex('ooo') # => nil
4516 *
4517 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4518 * or +nil+ if none found:
4519 *
4520 * 'foo'.byteindex(/f/) # => 0
4521 * 'foo'.byteindex(/o/) # => 1
4522 * 'foo'.byteindex(/oo/) # => 1
4523 * 'foo'.byteindex(/ooo/) # => nil
4524 *
4525 * Integer argument +offset+, if given, specifies the byte-based position in the
4526 * string to begin the search:
4527 *
4528 * 'foo'.byteindex('o', 1) # => 1
4529 * 'foo'.byteindex('o', 2) # => 2
4530 * 'foo'.byteindex('o', 3) # => nil
4531 *
4532 * If +offset+ is negative, counts backward from the end of +self+:
4533 *
4534 * 'foo'.byteindex('o', -1) # => 2
4535 * 'foo'.byteindex('o', -2) # => 1
4536 * 'foo'.byteindex('o', -3) # => 1
4537 * 'foo'.byteindex('o', -4) # => nil
4538 *
4539 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4540 * raised.
4541 *
4542 * Related: String#index, String#byterindex.
4543 */
4544
4545static VALUE
4546rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4547{
4548 VALUE sub;
4549 VALUE initpos;
4550 long pos;
4551
4552 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4553 long slen = RSTRING_LEN(str);
4554 pos = NUM2LONG(initpos);
4555 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4556 if (RB_TYPE_P(sub, T_REGEXP)) {
4558 }
4559 return Qnil;
4560 }
4561 }
4562 else {
4563 pos = 0;
4564 }
4565
4566 str_ensure_byte_pos(str, pos);
4567
4568 if (RB_TYPE_P(sub, T_REGEXP)) {
4569 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4570 VALUE match = rb_backref_get();
4571 struct re_registers *regs = RMATCH_REGS(match);
4572 pos = BEG(0);
4573 return LONG2NUM(pos);
4574 }
4575 }
4576 else {
4577 StringValue(sub);
4578 pos = rb_str_byteindex(str, sub, pos);
4579 if (pos >= 0) return LONG2NUM(pos);
4580 }
4581 return Qnil;
4582}
4583
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when the platform provides no memrchr().
 *
 * Scans the search_len bytes starting at search_str backwards and
 * returns a pointer to the last byte equal to chr, or a null pointer
 * when there is none.  chr is converted to unsigned char before the
 * comparison, so a sign-extended char value matches its byte too.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char target = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == target) return (void *)ptr;
    }

    return NULL;
}
#endif
4596
4597static long
4598str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4599{
4600 char *hit, *adjusted;
4601 int c;
4602 long slen, searchlen;
4603 char *sbeg, *e, *t;
4604
4605 sbeg = RSTRING_PTR(str);
4606 slen = RSTRING_LEN(sub);
4607 if (slen == 0) return s - sbeg;
4608 e = RSTRING_END(str);
4609 t = RSTRING_PTR(sub);
4610 c = *t & 0xff;
4611 searchlen = s - sbeg + 1;
4612
4613 if (memcmp(s, t, slen) == 0) {
4614 return s - sbeg;
4615 }
4616
4617 do {
4618 hit = memrchr(sbeg, c, searchlen);
4619 if (!hit) break;
4620 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4621 if (hit != adjusted) {
4622 searchlen = adjusted - sbeg;
4623 continue;
4624 }
4625 if (memcmp(hit, t, slen) == 0)
4626 return hit - sbeg;
4627 searchlen = adjusted - sbeg;
4628 } while (searchlen > 0);
4629
4630 return -1;
4631}
4632
4633/* found index in byte */
4634static long
4635rb_str_rindex(VALUE str, VALUE sub, long pos)
4636{
4637 long len, slen;
4638 char *sbeg, *s;
4639 rb_encoding *enc;
4640 int singlebyte;
4641
4642 enc = rb_enc_check(str, sub);
4643 if (is_broken_string(sub)) return -1;
4644 singlebyte = single_byte_optimizable(str);
4645 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4646 slen = str_strlen(sub, enc); /* rb_enc_check */
4647
4648 /* substring longer than string */
4649 if (len < slen) return -1;
4650 if (len - pos < slen) pos = len - slen;
4651 if (len == 0) return pos;
4652
4653 sbeg = RSTRING_PTR(str);
4654
4655 if (pos == 0) {
4656 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4657 return 0;
4658 else
4659 return -1;
4660 }
4661
4662 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4663 return str_rindex(str, sub, s, enc);
4664}
4665
4666/*
4667 * call-seq:
4668 * rindex(substring, offset = self.length) -> integer or nil
4669 * rindex(regexp, offset = self.length) -> integer or nil
4670 *
4671 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4672 * or +nil+ if none found:
4673 *
4674 * 'foo'.rindex('f') # => 0
4675 * 'foo'.rindex('o') # => 2
4676 * 'foo'.rindex('oo') # => 1
4677 * 'foo'.rindex('ooo') # => nil
4678 *
4679 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4680 * or +nil+ if none found:
4681 *
4682 * 'foo'.rindex(/f/) # => 0
4683 * 'foo'.rindex(/o/) # => 2
4684 * 'foo'.rindex(/oo/) # => 1
4685 * 'foo'.rindex(/ooo/) # => nil
4686 *
4687 * The _last_ match means starting at the possible last position, not
4688 * the last of longest matches.
4689 *
4690 * 'foo'.rindex(/o+/) # => 2
4691 * $~ #=> #<MatchData "o">
4692 *
4693 * To get the last longest match, combine the pattern with a negative
4694 * lookbehind.
4695 *
4696 * 'foo'.rindex(/(?<!o)o+/) # => 1
4697 * $~ #=> #<MatchData "oo">
4698 *
4699 * Or use String#index with a negative lookahead.
4700 *
4701 * 'foo'.index(/o+(?!.*o)/) # => 1
4702 * $~ #=> #<MatchData "oo">
4703 *
4704 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4705 * string to _end_ the search:
4706 *
4707 * 'foo'.rindex('o', 0) # => nil
4708 * 'foo'.rindex('o', 1) # => 1
4709 * 'foo'.rindex('o', 2) # => 2
4710 * 'foo'.rindex('o', 3) # => 2
4711 *
4712 * If +offset+ is a negative Integer, the maximum starting position in the
4713 * string to _end_ the search is the sum of the string's length and +offset+:
4714 *
4715 * 'foo'.rindex('o', -1) # => 2
4716 * 'foo'.rindex('o', -2) # => 1
4717 * 'foo'.rindex('o', -3) # => nil
4718 * 'foo'.rindex('o', -4) # => nil
4719 *
4720 * Related: String#index.
4721 */
4722
4723static VALUE
4724rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4725{
4726 VALUE sub;
4727 VALUE initpos;
4728 rb_encoding *enc = STR_ENC_GET(str);
4729 long pos, len = str_strlen(str, enc); /* str's enc */
4730
4731 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4732 pos = NUM2LONG(initpos);
4733 if (pos < 0 && (pos += len) < 0) {
4734 if (RB_TYPE_P(sub, T_REGEXP)) {
4736 }
4737 return Qnil;
4738 }
4739 if (pos > len) pos = len;
4740 }
4741 else {
4742 pos = len;
4743 }
4744
4745 if (RB_TYPE_P(sub, T_REGEXP)) {
4746 /* enc = rb_enc_check(str, sub); */
4747 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4748 enc, single_byte_optimizable(str));
4749
4750 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4751 VALUE match = rb_backref_get();
4752 struct re_registers *regs = RMATCH_REGS(match);
4753 pos = rb_str_sublen(str, BEG(0));
4754 return LONG2NUM(pos);
4755 }
4756 }
4757 else {
4758 StringValue(sub);
4759 pos = rb_str_rindex(str, sub, pos);
4760 if (pos >= 0) {
4761 pos = rb_str_sublen(str, pos);
4762 return LONG2NUM(pos);
4763 }
4764 }
4765 return Qnil;
4766}
4767
4768static long
4769rb_str_byterindex(VALUE str, VALUE sub, long pos)
4770{
4771 long len, slen;
4772 char *sbeg, *s;
4773 rb_encoding *enc;
4774
4775 enc = rb_enc_check(str, sub);
4776 if (is_broken_string(sub)) return -1;
4777 len = RSTRING_LEN(str);
4778 slen = RSTRING_LEN(sub);
4779
4780 /* substring longer than string */
4781 if (len < slen) return -1;
4782 if (len - pos < slen) pos = len - slen;
4783 if (len == 0) return pos;
4784
4785 sbeg = RSTRING_PTR(str);
4786
4787 if (pos == 0) {
4788 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4789 return 0;
4790 else
4791 return -1;
4792 }
4793
4794 s = sbeg + pos;
4795 return str_rindex(str, sub, s, enc);
4796}
4797
4798
4799/*
4800 * call-seq:
4801 * byterindex(substring, offset = self.bytesize) -> integer or nil
4802 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4803 *
4804 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4805 * or +nil+ if none found:
4806 *
4807 * 'foo'.byterindex('f') # => 0
4808 * 'foo'.byterindex('o') # => 2
4809 * 'foo'.byterindex('oo') # => 1
4810 * 'foo'.byterindex('ooo') # => nil
4811 *
4812 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4813 * or +nil+ if none found:
4814 *
4815 * 'foo'.byterindex(/f/) # => 0
4816 * 'foo'.byterindex(/o/) # => 2
4817 * 'foo'.byterindex(/oo/) # => 1
4818 * 'foo'.byterindex(/ooo/) # => nil
4819 *
4820 * The _last_ match means starting at the possible last position, not
4821 * the last of longest matches.
4822 *
4823 * 'foo'.byterindex(/o+/) # => 2
4824 * $~ #=> #<MatchData "o">
4825 *
4826 * To get the last longest match, combine the pattern with a negative
4827 * lookbehind.
4828 *
4829 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4830 * $~ #=> #<MatchData "oo">
4831 *
4832 * Or use String#byteindex with a negative lookahead.
4833 *
4834 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4835 * $~ #=> #<MatchData "oo">
4836 *
4837 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4838 * string to _end_ the search:
4839 *
4840 * 'foo'.byterindex('o', 0) # => nil
4841 * 'foo'.byterindex('o', 1) # => 1
4842 * 'foo'.byterindex('o', 2) # => 2
4843 * 'foo'.byterindex('o', 3) # => 2
4844 *
4845 * If +offset+ is a negative Integer, the maximum starting position in the
4846 * string to _end_ the search is the sum of the string's length and +offset+:
4847 *
4848 * 'foo'.byterindex('o', -1) # => 2
4849 * 'foo'.byterindex('o', -2) # => 1
4850 * 'foo'.byterindex('o', -3) # => nil
4851 * 'foo'.byterindex('o', -4) # => nil
4852 *
4853 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4854 * raised.
4855 *
4856 * Related: String#byteindex.
4857 */
4858
4859static VALUE
4860rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4861{
4862 VALUE sub;
4863 VALUE initpos;
4864 long pos, len = RSTRING_LEN(str);
4865
4866 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4867 pos = NUM2LONG(initpos);
4868 if (pos < 0 && (pos += len) < 0) {
4869 if (RB_TYPE_P(sub, T_REGEXP)) {
4871 }
4872 return Qnil;
4873 }
4874 if (pos > len) pos = len;
4875 }
4876 else {
4877 pos = len;
4878 }
4879
4880 str_ensure_byte_pos(str, pos);
4881
4882 if (RB_TYPE_P(sub, T_REGEXP)) {
4883 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4884 VALUE match = rb_backref_get();
4885 struct re_registers *regs = RMATCH_REGS(match);
4886 pos = BEG(0);
4887 return LONG2NUM(pos);
4888 }
4889 }
4890 else {
4891 StringValue(sub);
4892 pos = rb_str_byterindex(str, sub, pos);
4893 if (pos >= 0) return LONG2NUM(pos);
4894 }
4895 return Qnil;
4896}
4897
4898/*
4899 * call-seq:
4900 * string =~ regexp -> integer or nil
4901 * string =~ object -> integer or nil
4902 *
4903 * Returns the Integer index of the first substring that matches
4904 * the given +regexp+, or +nil+ if no match found:
4905 *
4906 * 'foo' =~ /f/ # => 0
4907 * 'foo' =~ /o/ # => 1
4908 * 'foo' =~ /x/ # => nil
4909 *
4910 * Note: also updates Regexp@Global+Variables.
4911 *
4912 * If the given +object+ is not a Regexp, returns the value
4913 * returned by <tt>object =~ self</tt>.
4914 *
4915 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4916 * (see Regexp#=~):
4917 *
4918 * number= nil
4919 * "no. 9" =~ /(?<number>\d+)/
4920 * number # => nil (not assigned)
4921 * /(?<number>\d+)/ =~ "no. 9"
4922 * number #=> "9"
4923 *
4924 */
4925
4926static VALUE
4927rb_str_match(VALUE x, VALUE y)
4928{
4929 switch (OBJ_BUILTIN_TYPE(y)) {
4930 case T_STRING:
4931 rb_raise(rb_eTypeError, "type mismatch: String given");
4932
4933 case T_REGEXP:
4934 return rb_reg_match(y, x);
4935
4936 default:
4937 return rb_funcall(y, idEqTilde, 1, x);
4938 }
4939}
4940
4941
4942static VALUE get_pat(VALUE);
4943
4944
4945/*
4946 * call-seq:
4947 * match(pattern, offset = 0) -> matchdata or nil
4948 * match(pattern, offset = 0) {|matchdata| ... } -> object
4949 *
4950 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4951 *
4952 * Note: also updates Regexp@Global+Variables.
4953 *
4954 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4955 * regexp = Regexp.new(pattern)
4956 * - Computes +matchdata+, which will be either a MatchData object or +nil+
4957 * (see Regexp#match):
4958 * matchdata = <tt>regexp.match(self)
4959 *
4960 * With no block given, returns the computed +matchdata+:
4961 *
4962 * 'foo'.match('f') # => #<MatchData "f">
4963 * 'foo'.match('o') # => #<MatchData "o">
4964 * 'foo'.match('x') # => nil
4965 *
4966 * If Integer argument +offset+ is given, the search begins at index +offset+:
4967 *
4968 * 'foo'.match('f', 1) # => nil
4969 * 'foo'.match('o', 1) # => #<MatchData "o">
4970 *
4971 * With a block given, calls the block with the computed +matchdata+
4972 * and returns the block's return value:
4973 *
4974 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4975 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4976 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4977 *
4978 */
4979
4980static VALUE
4981rb_str_match_m(int argc, VALUE *argv, VALUE str)
4982{
4983 VALUE re, result;
4984 if (argc < 1)
4985 rb_check_arity(argc, 1, 2);
4986 re = argv[0];
4987 argv[0] = str;
4988 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4989 if (!NIL_P(result) && rb_block_given_p()) {
4990 return rb_yield(result);
4991 }
4992 return result;
4993}
4994
4995/*
4996 * call-seq:
4997 * match?(pattern, offset = 0) -> true or false
4998 *
4999 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5000 *
5001 * Note: does not update Regexp@Global+Variables.
5002 *
5003 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5004 * regexp = Regexp.new(pattern)
5005 *
5006 * Returns +true+ if <tt>self+.match(regexp)</tt> returns a MatchData object,
5007 * +false+ otherwise:
5008 *
5009 * 'foo'.match?(/o/) # => true
5010 * 'foo'.match?('o') # => true
5011 * 'foo'.match?(/x/) # => false
5012 *
5013 * If Integer argument +offset+ is given, the search begins at index +offset+:
5014 * 'foo'.match?('f', 1) # => false
5015 * 'foo'.match?('o', 1) # => true
5016 *
5017 */
5018
5019static VALUE
5020rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5021{
5022 VALUE re;
5023 rb_check_arity(argc, 1, 2);
5024 re = get_pat(argv[0]);
5025 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5026}
5027
/* Result of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* input (or result) is not a usable character */
    NEIGHBOR_FOUND,     /* neighbor of the same byte length was written */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the range */
};
5033
5034static enum neighbor_char
5035enc_succ_char(char *p, long len, rb_encoding *enc)
5036{
5037 long i;
5038 int l;
5039
5040 if (rb_enc_mbminlen(enc) > 1) {
5041 /* wchar, trivial case */
5042 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5043 if (!MBCLEN_CHARFOUND_P(r)) {
5044 return NEIGHBOR_NOT_CHAR;
5045 }
5046 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5047 l = rb_enc_code_to_mbclen(c, enc);
5048 if (!l) return NEIGHBOR_NOT_CHAR;
5049 if (l != len) return NEIGHBOR_WRAPPED;
5050 rb_enc_mbcput(c, p, enc);
5051 r = rb_enc_precise_mbclen(p, p + len, enc);
5052 if (!MBCLEN_CHARFOUND_P(r)) {
5053 return NEIGHBOR_NOT_CHAR;
5054 }
5055 return NEIGHBOR_FOUND;
5056 }
5057 while (1) {
5058 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5059 p[i] = '\0';
5060 if (i < 0)
5061 return NEIGHBOR_WRAPPED;
5062 ++((unsigned char*)p)[i];
5063 l = rb_enc_precise_mbclen(p, p+len, enc);
5064 if (MBCLEN_CHARFOUND_P(l)) {
5065 l = MBCLEN_CHARFOUND_LEN(l);
5066 if (l == len) {
5067 return NEIGHBOR_FOUND;
5068 }
5069 else {
5070 memset(p+l, 0xff, len-l);
5071 }
5072 }
5073 if (MBCLEN_INVALID_P(l) && i < len-1) {
5074 long len2;
5075 int l2;
5076 for (len2 = len-1; 0 < len2; len2--) {
5077 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5078 if (!MBCLEN_INVALID_P(l2))
5079 break;
5080 }
5081 memset(p+len2+1, 0xff, len-(len2+1));
5082 }
5083 }
5084}
5085
5086static enum neighbor_char
5087enc_pred_char(char *p, long len, rb_encoding *enc)
5088{
5089 long i;
5090 int l;
5091 if (rb_enc_mbminlen(enc) > 1) {
5092 /* wchar, trivial case */
5093 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5094 if (!MBCLEN_CHARFOUND_P(r)) {
5095 return NEIGHBOR_NOT_CHAR;
5096 }
5097 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5098 if (!c) return NEIGHBOR_NOT_CHAR;
5099 --c;
5100 l = rb_enc_code_to_mbclen(c, enc);
5101 if (!l) return NEIGHBOR_NOT_CHAR;
5102 if (l != len) return NEIGHBOR_WRAPPED;
5103 rb_enc_mbcput(c, p, enc);
5104 r = rb_enc_precise_mbclen(p, p + len, enc);
5105 if (!MBCLEN_CHARFOUND_P(r)) {
5106 return NEIGHBOR_NOT_CHAR;
5107 }
5108 return NEIGHBOR_FOUND;
5109 }
5110 while (1) {
5111 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5112 p[i] = '\xff';
5113 if (i < 0)
5114 return NEIGHBOR_WRAPPED;
5115 --((unsigned char*)p)[i];
5116 l = rb_enc_precise_mbclen(p, p+len, enc);
5117 if (MBCLEN_CHARFOUND_P(l)) {
5118 l = MBCLEN_CHARFOUND_LEN(l);
5119 if (l == len) {
5120 return NEIGHBOR_FOUND;
5121 }
5122 else {
5123 memset(p+l, 0, len-l);
5124 }
5125 }
5126 if (MBCLEN_INVALID_P(l) && i < len-1) {
5127 long len2;
5128 int l2;
5129 for (len2 = len-1; 0 < len2; len2--) {
5130 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5131 if (!MBCLEN_INVALID_P(l2))
5132 break;
5133 }
5134 memset(p+len2+1, 0, len-(len2+1));
5135 }
5136 }
5137}
5138
5139/*
5140 overwrite +p+ by succeeding letter in +enc+ and returns
5141 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5142 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5143 assuming each ranges are successive, and mbclen
5144 never change in each ranges.
5145 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5146 character.
5147 */
5148static enum neighbor_char
5149enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5150{
5151 enum neighbor_char ret;
5152 unsigned int c;
5153 int ctype;
5154 int range;
5155 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5156
5157 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5158 int try;
5159 const int max_gaps = 1;
5160
5161 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5162 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5163 ctype = ONIGENC_CTYPE_DIGIT;
5164 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5165 ctype = ONIGENC_CTYPE_ALPHA;
5166 else
5167 return NEIGHBOR_NOT_CHAR;
5168
5169 MEMCPY(save, p, char, len);
5170 for (try = 0; try <= max_gaps; ++try) {
5171 ret = enc_succ_char(p, len, enc);
5172 if (ret == NEIGHBOR_FOUND) {
5173 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5174 if (rb_enc_isctype(c, ctype, enc))
5175 return NEIGHBOR_FOUND;
5176 }
5177 }
5178 MEMCPY(p, save, char, len);
5179 range = 1;
5180 while (1) {
5181 MEMCPY(save, p, char, len);
5182 ret = enc_pred_char(p, len, enc);
5183 if (ret == NEIGHBOR_FOUND) {
5184 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5185 if (!rb_enc_isctype(c, ctype, enc)) {
5186 MEMCPY(p, save, char, len);
5187 break;
5188 }
5189 }
5190 else {
5191 MEMCPY(p, save, char, len);
5192 break;
5193 }
5194 range++;
5195 }
5196 if (range == 1) {
5197 return NEIGHBOR_NOT_CHAR;
5198 }
5199
5200 if (ctype != ONIGENC_CTYPE_DIGIT) {
5201 MEMCPY(carry, p, char, len);
5202 return NEIGHBOR_WRAPPED;
5203 }
5204
5205 MEMCPY(carry, p, char, len);
5206 enc_succ_char(carry, len, enc);
5207 return NEIGHBOR_WRAPPED;
5208}
5209
5210
5211static VALUE str_succ(VALUE str);
5212
5213/*
5214 * call-seq:
5215 * succ -> new_str
5216 *
5217 * Returns the successor to +self+. The successor is calculated by
5218 * incrementing characters.
5219 *
5220 * The first character to be incremented is the rightmost alphanumeric:
5221 * or, if no alphanumerics, the rightmost character:
5222 *
5223 * 'THX1138'.succ # => "THX1139"
5224 * '<<koala>>'.succ # => "<<koalb>>"
5225 * '***'.succ # => '**+'
5226 *
5227 * The successor to a digit is another digit, "carrying" to the next-left
5228 * character for a "rollover" from 9 to 0, and prepending another digit
5229 * if necessary:
5230 *
5231 * '00'.succ # => "01"
5232 * '09'.succ # => "10"
5233 * '99'.succ # => "100"
5234 *
5235 * The successor to a letter is another letter of the same case,
5236 * carrying to the next-left character for a rollover,
5237 * and prepending another same-case letter if necessary:
5238 *
5239 * 'aa'.succ # => "ab"
5240 * 'az'.succ # => "ba"
5241 * 'zz'.succ # => "aaa"
5242 * 'AA'.succ # => "AB"
5243 * 'AZ'.succ # => "BA"
5244 * 'ZZ'.succ # => "AAA"
5245 *
5246 * The successor to a non-alphanumeric character is the next character
5247 * in the underlying character set's collating sequence,
5248 * carrying to the next-left character for a rollover,
5249 * and prepending another character if necessary:
5250 *
5251 * s = 0.chr * 3
5252 * s # => "\x00\x00\x00"
5253 * s.succ # => "\x00\x00\x01"
5254 * s = 255.chr * 3
5255 * s # => "\xFF\xFF\xFF"
5256 * s.succ # => "\x01\x00\x00\x00"
5257 *
5258 * Carrying can occur between and among mixtures of alphanumeric characters:
5259 *
5260 * s = 'zz99zz99'
5261 * s.succ # => "aaa00aa00"
5262 * s = '99zz99zz'
5263 * s.succ # => "100aa00aa"
5264 *
5265 * The successor to an empty +String+ is a new empty +String+:
5266 *
5267 * ''.succ # => ""
5268 *
5269 */
5270
5271VALUE
5273{
5274 VALUE str;
5275 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5276 rb_enc_cr_str_copy_for_substr(str, orig);
5277 return str_succ(str);
5278}
5279
5280static VALUE
5281str_succ(VALUE str)
5282{
5283 rb_encoding *enc;
5284 char *sbeg, *s, *e, *last_alnum = 0;
5285 int found_alnum = 0;
5286 long l, slen;
5287 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5288 long carry_pos = 0, carry_len = 1;
5289 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5290
5291 slen = RSTRING_LEN(str);
5292 if (slen == 0) return str;
5293
5294 enc = STR_ENC_GET(str);
5295 sbeg = RSTRING_PTR(str);
5296 s = e = sbeg + slen;
5297
5298 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5299 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5300 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5301 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5302 break;
5303 }
5304 }
5305 l = rb_enc_precise_mbclen(s, e, enc);
5306 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5307 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5308 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5309 switch (neighbor) {
5310 case NEIGHBOR_NOT_CHAR:
5311 continue;
5312 case NEIGHBOR_FOUND:
5313 return str;
5314 case NEIGHBOR_WRAPPED:
5315 last_alnum = s;
5316 break;
5317 }
5318 found_alnum = 1;
5319 carry_pos = s - sbeg;
5320 carry_len = l;
5321 }
5322 if (!found_alnum) { /* str contains no alnum */
5323 s = e;
5324 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5325 enum neighbor_char neighbor;
5326 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5327 l = rb_enc_precise_mbclen(s, e, enc);
5328 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5329 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5330 MEMCPY(tmp, s, char, l);
5331 neighbor = enc_succ_char(tmp, l, enc);
5332 switch (neighbor) {
5333 case NEIGHBOR_FOUND:
5334 MEMCPY(s, tmp, char, l);
5335 return str;
5336 break;
5337 case NEIGHBOR_WRAPPED:
5338 MEMCPY(s, tmp, char, l);
5339 break;
5340 case NEIGHBOR_NOT_CHAR:
5341 break;
5342 }
5343 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5344 /* wrapped to \0...\0. search next valid char. */
5345 enc_succ_char(s, l, enc);
5346 }
5347 if (!rb_enc_asciicompat(enc)) {
5348 MEMCPY(carry, s, char, l);
5349 carry_len = l;
5350 }
5351 carry_pos = s - sbeg;
5352 }
5354 }
5355 RESIZE_CAPA(str, slen + carry_len);
5356 sbeg = RSTRING_PTR(str);
5357 s = sbeg + carry_pos;
5358 memmove(s + carry_len, s, slen - carry_pos);
5359 memmove(s, carry, carry_len);
5360 slen += carry_len;
5361 STR_SET_LEN(str, slen);
5362 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5364 return str;
5365}
5366
5367
5368/*
5369 * call-seq:
5370 * succ! -> self
5371 *
5372 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5373 */
5374
5375static VALUE
5376rb_str_succ_bang(VALUE str)
5377{
5378 rb_str_modify(str);
5379 str_succ(str);
5380 return str;
5381}
5382
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies
 * ISDIGIT (vacuously true for len <= 0), otherwise 0.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5392
5393static int
5394str_upto_i(VALUE str, VALUE arg)
5395{
5396 rb_yield(str);
5397 return 0;
5398}
5399
5400/*
5401 * call-seq:
5402 * upto(other_string, exclusive = false) {|string| ... } -> self
5403 * upto(other_string, exclusive = false) -> new_enumerator
5404 *
5405 * With a block given, calls the block with each +String+ value
5406 * returned by successive calls to String#succ;
5407 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5408 * the sequence terminates when value +other_string+ is reached;
5409 * returns +self+:
5410 *
5411 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5412 * Output:
5413 *
5414 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5415 *
5416 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5417 *
5418 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5419 *
5420 * Output:
5421 *
5422 * a8 a9 b0 b1 b2 b3 b4 b5
5423 *
5424 * If +other_string+ would not be reached, does not call the block:
5425 *
5426 * '25'.upto('5') {|s| fail s }
5427 * 'aa'.upto('a') {|s| fail s }
5428 *
5429 * With no block given, returns a new Enumerator:
5430 *
5431 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5432 *
5433 */
5434
5435static VALUE
5436rb_str_upto(int argc, VALUE *argv, VALUE beg)
5437{
5438 VALUE end, exclusive;
5439
5440 rb_scan_args(argc, argv, "11", &end, &exclusive);
5441 RETURN_ENUMERATOR(beg, argc, argv);
5442 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5443}
5444
5445VALUE
5446rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5447{
5448 VALUE current, after_end;
5449 ID succ;
5450 int n, ascii;
5451 rb_encoding *enc;
5452
5453 CONST_ID(succ, "succ");
5454 StringValue(end);
5455 enc = rb_enc_check(beg, end);
5456 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5457 /* single character */
5458 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5459 char c = RSTRING_PTR(beg)[0];
5460 char e = RSTRING_PTR(end)[0];
5461
5462 if (c > e || (excl && c == e)) return beg;
5463 for (;;) {
5464 VALUE str = rb_enc_str_new(&c, 1, enc);
5466 if ((*each)(str, arg)) break;
5467 if (!excl && c == e) break;
5468 c++;
5469 if (excl && c == e) break;
5470 }
5471 return beg;
5472 }
5473 /* both edges are all digits */
5474 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5475 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5476 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5477 VALUE b, e;
5478 int width;
5479
5480 width = RSTRING_LENINT(beg);
5481 b = rb_str_to_inum(beg, 10, FALSE);
5482 e = rb_str_to_inum(end, 10, FALSE);
5483 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5484 long bi = FIX2LONG(b);
5485 long ei = FIX2LONG(e);
5486 rb_encoding *usascii = rb_usascii_encoding();
5487
5488 while (bi <= ei) {
5489 if (excl && bi == ei) break;
5490 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5491 bi++;
5492 }
5493 }
5494 else {
5495 ID op = excl ? '<' : idLE;
5496 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5497
5498 args[0] = INT2FIX(width);
5499 while (rb_funcall(b, op, 1, e)) {
5500 args[1] = b;
5501 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5502 b = rb_funcallv(b, succ, 0, 0);
5503 }
5504 }
5505 return beg;
5506 }
5507 /* normal case */
5508 n = rb_str_cmp(beg, end);
5509 if (n > 0 || (excl && n == 0)) return beg;
5510
5511 after_end = rb_funcallv(end, succ, 0, 0);
5512 current = str_duplicate(rb_cString, beg);
5513 while (!rb_str_equal(current, after_end)) {
5514 VALUE next = Qnil;
5515 if (excl || !rb_str_equal(current, end))
5516 next = rb_funcallv(current, succ, 0, 0);
5517 if ((*each)(current, arg)) break;
5518 if (NIL_P(next)) break;
5519 current = next;
5520 StringValue(current);
5521 if (excl && rb_str_equal(current, end)) break;
5522 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5523 break;
5524 }
5525
5526 return beg;
5527}
5528
5529VALUE
5530rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5531{
5532 VALUE current;
5533 ID succ;
5534
5535 CONST_ID(succ, "succ");
5536 /* both edges are all digits */
5537 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5538 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5539 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5540 int width = RSTRING_LENINT(beg);
5541 b = rb_str_to_inum(beg, 10, FALSE);
5542 if (FIXNUM_P(b)) {
5543 long bi = FIX2LONG(b);
5544 rb_encoding *usascii = rb_usascii_encoding();
5545
5546 while (FIXABLE(bi)) {
5547 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5548 bi++;
5549 }
5550 b = LONG2NUM(bi);
5551 }
5552 args[0] = INT2FIX(width);
5553 while (1) {
5554 args[1] = b;
5555 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5556 b = rb_funcallv(b, succ, 0, 0);
5557 }
5558 }
5559 /* normal case */
5560 current = str_duplicate(rb_cString, beg);
5561 while (1) {
5562 VALUE next = rb_funcallv(current, succ, 0, 0);
5563 if ((*each)(current, arg)) break;
5564 current = next;
5565 StringValue(current);
5566 if (RSTRING_LEN(current) == 0)
5567 break;
5568 }
5569
5570 return beg;
5571}
5572
5573static int
5574include_range_i(VALUE str, VALUE arg)
5575{
5576 VALUE *argp = (VALUE *)arg;
5577 if (!rb_equal(str, *argp)) return 0;
5578 *argp = Qnil;
5579 return 1;
5580}
5581
5582VALUE
5583rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5584{
5585 beg = rb_str_new_frozen(beg);
5586 StringValue(end);
5587 end = rb_str_new_frozen(end);
5588 if (NIL_P(val)) return Qfalse;
5589 val = rb_check_string_type(val);
5590 if (NIL_P(val)) return Qfalse;
5591 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5592 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5593 rb_enc_asciicompat(STR_ENC_GET(val))) {
5594 const char *bp = RSTRING_PTR(beg);
5595 const char *ep = RSTRING_PTR(end);
5596 const char *vp = RSTRING_PTR(val);
5597 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5598 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5599 return Qfalse;
5600 else {
5601 char b = *bp;
5602 char e = *ep;
5603 char v = *vp;
5604
5605 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5606 if (b <= v && v < e) return Qtrue;
5607 return RBOOL(!RTEST(exclusive) && v == e);
5608 }
5609 }
5610 }
5611#if 0
5612 /* both edges are all digits */
5613 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5614 all_digits_p(bp, RSTRING_LEN(beg)) &&
5615 all_digits_p(ep, RSTRING_LEN(end))) {
5616 /* TODO */
5617 }
5618#endif
5619 }
5620 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5621
5622 return RBOOL(NIL_P(val));
5623}
5624
5625static VALUE
5626rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5627{
5628 if (rb_reg_search(re, str, 0, 0) >= 0) {
5629 VALUE match = rb_backref_get();
5630 int nth = rb_reg_backref_number(match, backref);
5631 return rb_reg_nth_match(nth, match);
5632 }
5633 return Qnil;
5634}
5635
5636static VALUE
5637rb_str_aref(VALUE str, VALUE indx)
5638{
5639 long idx;
5640
5641 if (FIXNUM_P(indx)) {
5642 idx = FIX2LONG(indx);
5643 }
5644 else if (RB_TYPE_P(indx, T_REGEXP)) {
5645 return rb_str_subpat(str, indx, INT2FIX(0));
5646 }
5647 else if (RB_TYPE_P(indx, T_STRING)) {
5648 if (rb_str_index(str, indx, 0) != -1)
5649 return str_duplicate(rb_cString, indx);
5650 return Qnil;
5651 }
5652 else {
5653 /* check if indx is Range */
5654 long beg, len = str_strlen(str, NULL);
5655 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5656 case Qfalse:
5657 break;
5658 case Qnil:
5659 return Qnil;
5660 default:
5661 return rb_str_substr(str, beg, len);
5662 }
5663 idx = NUM2LONG(indx);
5664 }
5665
5666 return str_substr(str, idx, 1, FALSE);
5667}
5668
5669
5670/*
5671 * call-seq:
5672 * string[index] -> new_string or nil
5673 * string[start, length] -> new_string or nil
5674 * string[range] -> new_string or nil
5675 * string[regexp, capture = 0] -> new_string or nil
5676 * string[substring] -> new_string or nil
5677 *
5678 * Returns the substring of +self+ specified by the arguments.
5679 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5680 *
5681 *
5682 */
5683
5684static VALUE
5685rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5686{
5687 if (argc == 2) {
5688 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5689 return rb_str_subpat(str, argv[0], argv[1]);
5690 }
5691 else {
5692 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5693 }
5694 }
5695 rb_check_arity(argc, 1, 2);
5696 return rb_str_aref(str, argv[0]);
5697}
5698
5699VALUE
5701{
5702 char *ptr = RSTRING_PTR(str);
5703 long olen = RSTRING_LEN(str), nlen;
5704
5705 str_modifiable(str);
5706 if (len > olen) len = olen;
5707 nlen = olen - len;
5708 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5709 char *oldptr = ptr;
5710 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5711 STR_SET_EMBED(str);
5712 ptr = RSTRING(str)->as.embed.ary;
5713 memmove(ptr, oldptr + len, nlen);
5714 if (fl == STR_NOEMBED) xfree(oldptr);
5715 }
5716 else {
5717 if (!STR_SHARED_P(str)) {
5718 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5719 rb_enc_cr_str_exact_copy(shared, str);
5720 OBJ_FREEZE(shared);
5721 }
5722 ptr = RSTRING(str)->as.heap.ptr += len;
5723 }
5724 STR_SET_LEN(str, nlen);
5725
5726 if (!SHARABLE_MIDDLE_SUBSTRING) {
5727 TERM_FILL(ptr + nlen, TERM_LEN(str));
5728 }
5730 return str;
5731}
5732
5733static void
5734rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5735{
5736 char *sptr;
5737 long slen;
5738 int cr;
5739
5740 if (beg == 0 && vlen == 0) {
5741 rb_str_drop_bytes(str, len);
5742 return;
5743 }
5744
5745 str_modify_keep_cr(str);
5746 RSTRING_GETMEM(str, sptr, slen);
5747 if (len < vlen) {
5748 /* expand string */
5749 RESIZE_CAPA(str, slen + vlen - len);
5750 sptr = RSTRING_PTR(str);
5751 }
5752
5754 cr = rb_enc_str_coderange(val);
5755 else
5757
5758 if (vlen != len) {
5759 memmove(sptr + beg + vlen,
5760 sptr + beg + len,
5761 slen - (beg + len));
5762 }
5763 if (vlen < beg && len < 0) {
5764 MEMZERO(sptr + slen, char, -len);
5765 }
5766 if (vlen > 0) {
5767 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5768 }
5769 slen += vlen - len;
5770 STR_SET_LEN(str, slen);
5771 TERM_FILL(&sptr[slen], TERM_LEN(str));
5772 ENC_CODERANGE_SET(str, cr);
5773}
5774
5775static inline void
5776rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5777{
5778 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5779}
5780
5781void
5782rb_str_update(VALUE str, long beg, long len, VALUE val)
5783{
5784 long slen;
5785 char *p, *e;
5786 rb_encoding *enc;
5787 int singlebyte = single_byte_optimizable(str);
5788 int cr;
5789
5790 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5791
5792 StringValue(val);
5793 enc = rb_enc_check(str, val);
5794 slen = str_strlen(str, enc); /* rb_enc_check */
5795
5796 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5797 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5798 }
5799 if (beg < 0) {
5800 beg += slen;
5801 }
5802 RUBY_ASSERT(beg >= 0);
5803 RUBY_ASSERT(beg <= slen);
5804
5805 if (len > slen - beg) {
5806 len = slen - beg;
5807 }
5808 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5809 if (!p) p = RSTRING_END(str);
5810 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5811 if (!e) e = RSTRING_END(str);
5812 /* error check */
5813 beg = p - RSTRING_PTR(str); /* physical position */
5814 len = e - p; /* physical length */
5815 rb_str_update_0(str, beg, len, val);
5816 rb_enc_associate(str, enc);
5818 if (cr != ENC_CODERANGE_BROKEN)
5819 ENC_CODERANGE_SET(str, cr);
5820}
5821
5822static void
5823rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5824{
5825 int nth;
5826 VALUE match;
5827 long start, end, len;
5828 rb_encoding *enc;
5829 struct re_registers *regs;
5830
5831 if (rb_reg_search(re, str, 0, 0) < 0) {
5832 rb_raise(rb_eIndexError, "regexp not matched");
5833 }
5834 match = rb_backref_get();
5835 nth = rb_reg_backref_number(match, backref);
5836 regs = RMATCH_REGS(match);
5837 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5838 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5839 }
5840 if (nth < 0) {
5841 nth += regs->num_regs;
5842 }
5843
5844 start = BEG(nth);
5845 if (start == -1) {
5846 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5847 }
5848 end = END(nth);
5849 len = end - start;
5850 StringValue(val);
5851 enc = rb_enc_check_str(str, val);
5852 rb_str_update_0(str, start, len, val);
5853 rb_enc_associate(str, enc);
5854}
5855
5856static VALUE
5857rb_str_aset(VALUE str, VALUE indx, VALUE val)
5858{
5859 long idx, beg;
5860
5861 switch (TYPE(indx)) {
5862 case T_REGEXP:
5863 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5864 return val;
5865
5866 case T_STRING:
5867 beg = rb_str_index(str, indx, 0);
5868 if (beg < 0) {
5869 rb_raise(rb_eIndexError, "string not matched");
5870 }
5871 beg = rb_str_sublen(str, beg);
5872 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5873 return val;
5874
5875 default:
5876 /* check if indx is Range */
5877 {
5878 long beg, len;
5879 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5880 rb_str_update(str, beg, len, val);
5881 return val;
5882 }
5883 }
5884 /* FALLTHROUGH */
5885
5886 case T_FIXNUM:
5887 idx = NUM2LONG(indx);
5888 rb_str_update(str, idx, 1, val);
5889 return val;
5890 }
5891}
5892
5893/*
5894 * call-seq:
5895 * string[index] = new_string
5896 * string[start, length] = new_string
5897 * string[range] = new_string
5898 * string[regexp, capture = 0] = new_string
5899 * string[substring] = new_string
5900 *
5901 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5902 * See {String Slices}[rdoc-ref:String@String+Slices].
5903 *
5904 * A few examples:
5905 *
5906 * s = 'foo'
5907 * s[2] = 'rtune' # => "rtune"
5908 * s # => "fortune"
5909 * s[1, 5] = 'init' # => "init"
5910 * s # => "finite"
5911 * s[3..4] = 'al' # => "al"
5912 * s # => "finale"
5913 * s[/e$/] = 'ly' # => "ly"
5914 * s # => "finally"
5915 * s['lly'] = 'ncial' # => "ncial"
5916 * s # => "financial"
5917 *
5918 */
5919
5920static VALUE
5921rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5922{
5923 if (argc == 3) {
5924 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5925 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5926 }
5927 else {
5928 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5929 }
5930 return argv[2];
5931 }
5932 rb_check_arity(argc, 2, 3);
5933 return rb_str_aset(str, argv[0], argv[1]);
5934}
5935
5936/*
5937 * call-seq:
5938 * insert(index, other_string) -> self
5939 *
5940 * Inserts the given +other_string+ into +self+; returns +self+.
5941 *
5942 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5943 *
5944 * 'foo'.insert(1, 'bar') # => "fbaroo"
5945 *
5946 * If the Integer +index+ is negative, counts backward from the end of +self+
5947 * and inserts +other_string+ at offset <tt>index+1</tt>
5948 * (that is, _after_ <tt>self[index]</tt>):
5949 *
5950 * 'foo'.insert(-2, 'bar') # => "fobaro"
5951 *
5952 */
5953
5954static VALUE
5955rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5956{
5957 long pos = NUM2LONG(idx);
5958
5959 if (pos == -1) {
5960 return rb_str_append(str, str2);
5961 }
5962 else if (pos < 0) {
5963 pos++;
5964 }
5965 rb_str_update(str, pos, 0, str2);
5966 return str;
5967}
5968
5969
5970/*
5971 * call-seq:
5972 * slice!(index) -> new_string or nil
5973 * slice!(start, length) -> new_string or nil
5974 * slice!(range) -> new_string or nil
5975 * slice!(regexp, capture = 0) -> new_string or nil
5976 * slice!(substring) -> new_string or nil
5977 *
5978 * Removes and returns the substring of +self+ specified by the arguments.
5979 * See {String Slices}[rdoc-ref:String@String+Slices].
5980 *
5981 * A few examples:
5982 *
5983 * string = "This is a string"
5984 * string.slice!(2) #=> "i"
5985 * string.slice!(3..6) #=> " is "
5986 * string.slice!(/s.*t/) #=> "sa st"
5987 * string.slice!("r") #=> "r"
5988 * string #=> "Thing"
5989 *
5990 */
5991
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1;  /* len stays 1 for the single-index forms */
    char *p;

    rb_check_arity(argc, 1, 2);
    str_modify_keep_cr(str);
    indx = argv[0];
    if (RB_TYPE_P(indx, T_REGEXP)) {
        /* slice!(regexp, capture = 0): remove the text of one match register. */
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        int nth = 0;
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            /* A negative register number is offset by the register count. */
            if ((nth += regs->num_regs) <= 0) return Qnil;
        }
        else if (nth >= regs->num_regs) return Qnil;
        beg = BEG(nth);
        len = END(nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        /* slice!(start, length) */
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        /* slice!(index) with a Fixnum: a zero-length result counts as "nothing to remove". */
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        /*
         * slice!(substring): the result is a copy of the argument itself,
         * so the subseq step is skipped and only the removal is performed.
         */
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        /* Either a Range or something convertible to an integer index. */
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            /* Not a Range: treat like the single-index form above. */
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    /* Resolve (beg, len) through rb_str_subpos; beg becomes an offset from the start of the buffer. */
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    beg = p - RSTRING_PTR(str);

  subseq:
    /* Copy out the bytes being removed; they become the return value. */
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    /* Remove len bytes at offset beg from str in place. */
    if (len > 0) {
        if (beg == 0) {
            rb_str_drop_bytes(str, len);
        }
        else {
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            /* Slide the tail down over the removed span, then re-terminate. */
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
6078
6079static VALUE
6080get_pat(VALUE pat)
6081{
6082 VALUE val;
6083
6084 switch (OBJ_BUILTIN_TYPE(pat)) {
6085 case T_REGEXP:
6086 return pat;
6087
6088 case T_STRING:
6089 break;
6090
6091 default:
6092 val = rb_check_string_type(pat);
6093 if (NIL_P(val)) {
6094 Check_Type(pat, T_REGEXP);
6095 }
6096 pat = val;
6097 }
6098
6099 return rb_reg_regcomp(pat);
6100}
6101
6102static VALUE
6103get_pat_quoted(VALUE pat, int check)
6104{
6105 VALUE val;
6106
6107 switch (OBJ_BUILTIN_TYPE(pat)) {
6108 case T_REGEXP:
6109 return pat;
6110
6111 case T_STRING:
6112 break;
6113
6114 default:
6115 val = rb_check_string_type(pat);
6116 if (NIL_P(val)) {
6117 Check_Type(pat, T_REGEXP);
6118 }
6119 pat = val;
6120 }
6121 if (check && is_broken_string(pat)) {
6122 rb_exc_raise(rb_reg_check_preprocess(pat));
6123 }
6124 return pat;
6125}
6126
6127static long
6128rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6129{
6130 if (BUILTIN_TYPE(pat) == T_STRING) {
6131 pos = rb_str_byteindex(str, pat, pos);
6132 if (set_backref_str) {
6133 if (pos >= 0) {
6134 str = rb_str_new_frozen_String(str);
6135 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6136 }
6137 else {
6139 }
6140 }
6141 return pos;
6142 }
6143 else {
6144 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
6145 }
6146}
6147
6148
6149/*
6150 * call-seq:
6151 * sub!(pattern, replacement) -> self or nil
6152 * sub!(pattern) {|match| ... } -> self or nil
6153 *
6154 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6155 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6156 *
6157 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6158 *
6159 * Related: String#sub, String#gsub, String#gsub!.
6160 *
6161 */
6162
6163static VALUE
6164rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6165{
6166 VALUE pat, repl, hash = Qnil;
6167 int iter = 0;
6168 long plen;
6169 int min_arity = rb_block_given_p() ? 1 : 2;
6170 long beg;
6171
6172 rb_check_arity(argc, min_arity, 2);
6173 if (argc == 1) {
6174 iter = 1;
6175 }
6176 else {
6177 repl = argv[1];
6178 hash = rb_check_hash_type(argv[1]);
6179 if (NIL_P(hash)) {
6180 StringValue(repl);
6181 }
6182 }
6183
6184 pat = get_pat_quoted(argv[0], 1);
6185
6186 str_modifiable(str);
6187 beg = rb_pat_search(pat, str, 0, 1);
6188 if (beg >= 0) {
6189 rb_encoding *enc;
6190 int cr = ENC_CODERANGE(str);
6191 long beg0, end0;
6192 VALUE match, match0 = Qnil;
6193 struct re_registers *regs;
6194 char *p, *rp;
6195 long len, rlen;
6196
6197 match = rb_backref_get();
6198 regs = RMATCH_REGS(match);
6199 if (RB_TYPE_P(pat, T_STRING)) {
6200 beg0 = beg;
6201 end0 = beg0 + RSTRING_LEN(pat);
6202 match0 = pat;
6203 }
6204 else {
6205 beg0 = BEG(0);
6206 end0 = END(0);
6207 if (iter) match0 = rb_reg_nth_match(0, match);
6208 }
6209
6210 if (iter || !NIL_P(hash)) {
6211 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6212
6213 if (iter) {
6214 repl = rb_obj_as_string(rb_yield(match0));
6215 }
6216 else {
6217 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6218 repl = rb_obj_as_string(repl);
6219 }
6220 str_mod_check(str, p, len);
6221 rb_check_frozen(str);
6222 }
6223 else {
6224 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6225 }
6226
6227 enc = rb_enc_compatible(str, repl);
6228 if (!enc) {
6229 rb_encoding *str_enc = STR_ENC_GET(str);
6230 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6231 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6232 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6233 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6234 rb_enc_inspect_name(str_enc),
6235 rb_enc_inspect_name(STR_ENC_GET(repl)));
6236 }
6237 enc = STR_ENC_GET(repl);
6238 }
6239 rb_str_modify(str);
6240 rb_enc_associate(str, enc);
6242 int cr2 = ENC_CODERANGE(repl);
6243 if (cr2 == ENC_CODERANGE_BROKEN ||
6244 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6246 else
6247 cr = cr2;
6248 }
6249 plen = end0 - beg0;
6250 rlen = RSTRING_LEN(repl);
6251 len = RSTRING_LEN(str);
6252 if (rlen > plen) {
6253 RESIZE_CAPA(str, len + rlen - plen);
6254 }
6255 p = RSTRING_PTR(str);
6256 if (rlen != plen) {
6257 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6258 }
6259 rp = RSTRING_PTR(repl);
6260 memmove(p + beg0, rp, rlen);
6261 len += rlen - plen;
6262 STR_SET_LEN(str, len);
6263 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6264 ENC_CODERANGE_SET(str, cr);
6265
6266 RB_GC_GUARD(match);
6267
6268 return str;
6269 }
6270 return Qnil;
6271}
6272
6273
6274/*
6275 * call-seq:
6276 * sub(pattern, replacement) -> new_string
6277 * sub(pattern) {|match| ... } -> new_string
6278 *
6279 * Returns a copy of +self+ with only the first occurrence
6280 * (not all occurrences) of the given +pattern+ replaced.
6281 *
6282 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6283 *
6284 * Related: String#sub!, String#gsub, String#gsub!.
6285 *
6286 */
6287
6288static VALUE
6289rb_str_sub(int argc, VALUE *argv, VALUE str)
6290{
6291 str = str_duplicate(rb_cString, str);
6292 rb_str_sub_bang(argc, argv, str);
6293 return str;
6294}
6295
/*
 * Shared implementation of String#gsub and String#gsub!.
 *
 * Builds the substituted text in a fresh buffer (dest).  With +bang+
 * non-zero the result replaces the contents of +str+ and nil is returned
 * when nothing matched; otherwise the new string (or a duplicate of +str+
 * when nothing matched) is returned.
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
    long beg, beg0, end0;
    long offset, blen, slen, len, last;
    /* STR: string replacement, ITER: block, MAP: hash lookup. */
    enum {STR, ITER, MAP} mode = STR;
    char *sp, *cp;
    /* -1: not yet known; set to 0 or 1 after the first rb_reg_regsub() call. */
    int need_backref = -1;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        mode = ITER;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        else {
            mode = MAP;
        }
        break;
      default:
        rb_error_arity(argc, 1, 2);
    }

    pat = get_pat_quoted(argv[0], 1);
    beg = rb_pat_search(pat, str, 0, need_backref);
    if (beg < 0) {
        if (bang) return Qnil;	/* no match, no substitution */
        return str_duplicate(rb_cString, str);
    }

    offset = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* sp/slen snapshot the buffer so str_mod_check can detect modification by user code. */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        if (RB_TYPE_P(pat, T_STRING)) {
            /* String pattern: the match is exactly the pattern's bytes. */
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (mode == ITER) match0 = rb_reg_nth_match(0, match);
        }

        if (mode) {
            /* ITER or MAP: the replacement comes from user code. */
            if (mode == ITER) {
                val = rb_obj_as_string(rb_yield(match0));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                val = rb_obj_as_string(val);
            }
            str_mod_check(str, sp, slen);
            if (val == dest) { 	/* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else if (need_backref) {
            val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
            if (need_backref < 0) {
                /*
                 * First iteration: if regsub handed back repl itself, later
                 * iterations reuse repl directly (the else branch below).
                 */
                need_backref = val != repl;
            }
        }
        else {
            val = repl;
        }

        len = beg0 - offset;	/* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        /* Remember where this search started; used for the final search after the loop. */
        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_pat_search(pat, str, offset, need_backref);

        RB_GC_GUARD(match);
    } while (beg >= 0);
    /* Copy whatever follows the last match. */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /*
     * Search once more from `last` with backref recording forced on.
     * NOTE(review): presumably this leaves the backref describing the
     * final successful match -- confirm against rb_pat_search callers.
     */
    rb_pat_search(pat, str, last, 1);
    if (bang) {
        str_shared_replace(str, dest);
    }
    else {
        str = dest;
    }

    return str;
}
6418
6419
6420/*
6421 * call-seq:
6422 * gsub!(pattern, replacement) -> self or nil
6423 * gsub!(pattern) {|match| ... } -> self or nil
6424 * gsub!(pattern) -> an_enumerator
6425 *
6426 * Performs the specified substring replacement(s) on +self+;
6427 * returns +self+ if any replacement occurred, +nil+ otherwise.
6428 *
6429 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6430 *
6431 * Returns an Enumerator if no +replacement+ and no block given.
6432 *
6433 * Related: String#sub, String#gsub, String#sub!.
6434 *
6435 */
6436
6437static VALUE
6438rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6439{
6440 str_modify_keep_cr(str);
6441 return str_gsub(argc, argv, str, 1);
6442}
6443
6444
6445/*
6446 * call-seq:
6447 * gsub(pattern, replacement) -> new_string
6448 * gsub(pattern) {|match| ... } -> new_string
6449 * gsub(pattern) -> enumerator
6450 *
6451 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6452 *
6453 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6454 *
6455 * Returns an Enumerator if no +replacement+ and no block given.
6456 *
6457 * Related: String#sub, String#sub!, String#gsub!.
6458 *
6459 */
6460
6461static VALUE
6462rb_str_gsub(int argc, VALUE *argv, VALUE str)
6463{
6464 return str_gsub(argc, argv, str, 0);
6465}
6466
6467
6468/*
6469 * call-seq:
6470 * replace(other_string) -> self
6471 *
6472 * Replaces the contents of +self+ with the contents of +other_string+:
6473 *
6474 * s = 'foo' # => "foo"
6475 * s.replace('bar') # => "bar"
6476 *
6477 */
6478
6479VALUE
6481{
6482 str_modifiable(str);
6483 if (str == str2) return str;
6484
6485 StringValue(str2);
6486 str_discard(str);
6487 return str_replace(str, str2);
6488}
6489
6490/*
6491 * call-seq:
6492 * clear -> self
6493 *
6494 * Removes the contents of +self+:
6495 *
6496 * s = 'foo' # => "foo"
6497 * s.clear # => ""
6498 *
6499 */
6500
6501static VALUE
6502rb_str_clear(VALUE str)
6503{
6504 str_discard(str);
6505 STR_SET_EMBED(str);
6506 STR_SET_LEN(str, 0);
6507 RSTRING_PTR(str)[0] = 0;
6508 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6510 else
6512 return str;
6513}
6514
6515/*
6516 * call-seq:
6517 * chr -> string
6518 *
6519 * Returns a string containing the first character of +self+:
6520 *
6521 * s = 'foo' # => "foo"
6522 * s.chr # => "f"
6523 *
6524 */
6525
6526static VALUE
6527rb_str_chr(VALUE str)
6528{
6529 return rb_str_substr(str, 0, 1);
6530}
6531
6532/*
6533 * call-seq:
6534 * getbyte(index) -> integer or nil
6535 *
6536 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6537 *
6538 * s = 'abcde' # => "abcde"
6539 * s.getbyte(0) # => 97
6540 * s.getbyte(-1) # => 101
6541 * s.getbyte(5) # => nil
6542 *
6543 * Related: String#setbyte.
6544 */
6545VALUE
6546rb_str_getbyte(VALUE str, VALUE index)
6547{
6548 long pos = NUM2LONG(index);
6549
6550 if (pos < 0)
6551 pos += RSTRING_LEN(str);
6552 if (pos < 0 || RSTRING_LEN(str) <= pos)
6553 return Qnil;
6554
6555 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6556}
6557
6558/*
6559 * call-seq:
6560 * setbyte(index, integer) -> integer
6561 *
6562 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6563 *
6564 * s = 'abcde' # => "abcde"
6565 * s.setbyte(0, 98) # => 98
6566 * s # => "bbcde"
6567 *
6568 * Related: String#getbyte.
6569 */
6570VALUE
6571rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6572{
6573 long pos = NUM2LONG(index);
6574 long len = RSTRING_LEN(str);
6575 char *ptr, *head, *left = 0;
6576 rb_encoding *enc;
6577 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6578
6579 if (pos < -len || len <= pos)
6580 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6581 if (pos < 0)
6582 pos += len;
6583
6584 VALUE v = rb_to_int(value);
6585 VALUE w = rb_int_and(v, INT2FIX(0xff));
6586 char byte = (char)(NUM2INT(w) & 0xFF);
6587
6588 if (!str_independent(str))
6589 str_make_independent(str);
6590 enc = STR_ENC_GET(str);
6591 head = RSTRING_PTR(str);
6592 ptr = &head[pos];
6593 if (!STR_EMBED_P(str)) {
6594 cr = ENC_CODERANGE(str);
6595 switch (cr) {
6596 case ENC_CODERANGE_7BIT:
6597 left = ptr;
6598 *ptr = byte;
6599 if (ISASCII(byte)) goto end;
6600 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6601 if (!MBCLEN_CHARFOUND_P(nlen))
6603 else
6605 goto end;
6607 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6608 width = rb_enc_precise_mbclen(left, head+len, enc);
6609 *ptr = byte;
6610 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6611 if (!MBCLEN_CHARFOUND_P(nlen))
6613 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6615 goto end;
6616 }
6617 }
6619 *ptr = byte;
6620
6621 end:
6622 return value;
6623}
6624
6625static VALUE
6626str_byte_substr(VALUE str, long beg, long len, int empty)
6627{
6628 long n = RSTRING_LEN(str);
6629
6630 if (beg > n || len < 0) return Qnil;
6631 if (beg < 0) {
6632 beg += n;
6633 if (beg < 0) return Qnil;
6634 }
6635 if (len > n - beg)
6636 len = n - beg;
6637 if (len <= 0) {
6638 if (!empty) return Qnil;
6639 len = 0;
6640 }
6641
6642 VALUE str2 = str_subseq(str, beg, len);
6643
6644 str_enc_copy_direct(str2, str);
6645
6646 if (RSTRING_LEN(str2) == 0) {
6647 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6649 else
6651 }
6652 else {
6653 switch (ENC_CODERANGE(str)) {
6654 case ENC_CODERANGE_7BIT:
6656 break;
6657 default:
6659 break;
6660 }
6661 }
6662
6663 return str2;
6664}
6665
6666VALUE
6667rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6668{
6669 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6670}
6671
6672static VALUE
6673str_byte_aref(VALUE str, VALUE indx)
6674{
6675 long idx;
6676 if (FIXNUM_P(indx)) {
6677 idx = FIX2LONG(indx);
6678 }
6679 else {
6680 /* check if indx is Range */
6681 long beg, len = RSTRING_LEN(str);
6682
6683 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6684 case Qfalse:
6685 break;
6686 case Qnil:
6687 return Qnil;
6688 default:
6689 return str_byte_substr(str, beg, len, TRUE);
6690 }
6691
6692 idx = NUM2LONG(indx);
6693 }
6694 return str_byte_substr(str, idx, 1, FALSE);
6695}
6696
6697/*
6698 * call-seq:
6699 * byteslice(index, length = 1) -> string or nil
6700 * byteslice(range) -> string or nil
6701 *
6702 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6703 *
6704 * With integer arguments +index+ and +length+ given,
6705 * returns the substring beginning at the given +index+
6706 * of the given +length+ (if possible),
6707 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6708 *
6709 * s = '0123456789' # => "0123456789"
6710 * s.byteslice(2) # => "2"
6711 * s.byteslice(200) # => nil
6712 * s.byteslice(4, 3) # => "456"
6713 * s.byteslice(4, 30) # => "456789"
6714 * s.byteslice(4, -1) # => nil
6715 * s.byteslice(40, 2) # => nil
6716 *
6717 * In either case above, counts backwards from the end of +self+
6718 * if +index+ is negative:
6719 *
6720 * s = '0123456789' # => "0123456789"
6721 * s.byteslice(-4) # => "6"
6722 * s.byteslice(-4, 3) # => "678"
6723 *
6724 * With Range argument +range+ given, returns
6725 * <tt>byteslice(range.begin, range.size)</tt>:
6726 *
6727 * s = '0123456789' # => "0123456789"
6728 * s.byteslice(4..6) # => "456"
6729 * s.byteslice(-6..-4) # => "456"
6730 * s.byteslice(5..2) # => "" # range.size is zero.
6731 * s.byteslice(40..42) # => nil
6732 *
6733 * In all cases, a returned string has the same encoding as +self+:
6734 *
6735 * s.encoding # => #<Encoding:UTF-8>
6736 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6737 *
6738 */
6739
6740static VALUE
6741rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6742{
6743 if (argc == 2) {
6744 long beg = NUM2LONG(argv[0]);
6745 long len = NUM2LONG(argv[1]);
6746 return str_byte_substr(str, beg, len, TRUE);
6747 }
6748 rb_check_arity(argc, 1, 2);
6749 return str_byte_aref(str, argv[0]);
6750}
6751
6752static void
6753str_check_beg_len(VALUE str, long *beg, long *len)
6754{
6755 long end, slen = RSTRING_LEN(str);
6756
6757 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6758 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6759 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6760 }
6761 if (*beg < 0) {
6762 *beg += slen;
6763 }
6764 RUBY_ASSERT(*beg >= 0);
6765 RUBY_ASSERT(*beg <= slen);
6766
6767 if (*len > slen - *beg) {
6768 *len = slen - *beg;
6769 }
6770 end = *beg + *len;
6771 str_ensure_byte_pos(str, *beg);
6772 str_ensure_byte_pos(str, end);
6773}
6774
6775/*
6776 * call-seq:
6777 * bytesplice(index, length, str) -> string
6778 * bytesplice(index, length, str, str_index, str_length) -> string
6779 * bytesplice(range, str) -> string
6780 * bytesplice(range, str, str_range) -> string
6781 *
6782 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6783 * The portion of the string affected is determined using
6784 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6785 * If the replacement string is not the same length as the text it is replacing,
6786 * the string will be adjusted accordingly.
6787 *
6788 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6789 *
 *  The forms that take Integer arguments raise an IndexError if a value is out
 *  of range; the Range forms raise a RangeError.
 *  If the beginning or ending offset does not land on a character (codepoint)
 *  boundary, an IndexError is raised.
6794 */
6795
6796static VALUE
6797rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6798{
6799 long beg, len, vbeg, vlen;
6800 VALUE val;
6801 int cr;
6802
6803 rb_check_arity(argc, 2, 5);
6804 if (!(argc == 2 || argc == 3 || argc == 5)) {
6805 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6806 }
6807 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6808 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6809 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6810 rb_builtin_class_name(argv[0]));
6811 }
6812 val = argv[1];
6813 StringValue(val);
6814 if (argc == 2) {
6815 /* bytesplice(range, str) */
6816 vbeg = 0;
6817 vlen = RSTRING_LEN(val);
6818 }
6819 else {
6820 /* bytesplice(range, str, str_range) */
6821 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6822 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6823 rb_builtin_class_name(argv[2]));
6824 }
6825 }
6826 }
6827 else {
6828 beg = NUM2LONG(argv[0]);
6829 len = NUM2LONG(argv[1]);
6830 val = argv[2];
6831 StringValue(val);
6832 if (argc == 3) {
6833 /* bytesplice(index, length, str) */
6834 vbeg = 0;
6835 vlen = RSTRING_LEN(val);
6836 }
6837 else {
6838 /* bytesplice(index, length, str, str_index, str_length) */
6839 vbeg = NUM2LONG(argv[3]);
6840 vlen = NUM2LONG(argv[4]);
6841 }
6842 }
6843 str_check_beg_len(str, &beg, &len);
6844 str_check_beg_len(val, &vbeg, &vlen);
6845 str_modify_keep_cr(str);
6846
6847 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6848 rb_enc_associate(str, rb_enc_check(str, val));
6849 }
6850
6851 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6853 if (cr != ENC_CODERANGE_BROKEN)
6854 ENC_CODERANGE_SET(str, cr);
6855 return str;
6856}
6857
6858/*
6859 * call-seq:
6860 * reverse -> string
6861 *
6862 * Returns a new string with the characters from +self+ in reverse order.
6863 *
6864 * 'stressed'.reverse # => "desserts"
6865 *
6866 */
6867
6868static VALUE
6869rb_str_reverse(VALUE str)
6870{
6871 rb_encoding *enc;
6872 VALUE rev;
6873 char *s, *e, *p;
6874 int cr;
6875
6876 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6877 enc = STR_ENC_GET(str);
6878 rev = rb_str_new(0, RSTRING_LEN(str));
6879 s = RSTRING_PTR(str); e = RSTRING_END(str);
6880 p = RSTRING_END(rev);
6881 cr = ENC_CODERANGE(str);
6882
6883 if (RSTRING_LEN(str) > 1) {
6884 if (single_byte_optimizable(str)) {
6885 while (s < e) {
6886 *--p = *s++;
6887 }
6888 }
6889 else if (cr == ENC_CODERANGE_VALID) {
6890 while (s < e) {
6891 int clen = rb_enc_fast_mbclen(s, e, enc);
6892
6893 p -= clen;
6894 memcpy(p, s, clen);
6895 s += clen;
6896 }
6897 }
6898 else {
6899 cr = rb_enc_asciicompat(enc) ?
6901 while (s < e) {
6902 int clen = rb_enc_mbclen(s, e, enc);
6903
6904 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6905 p -= clen;
6906 memcpy(p, s, clen);
6907 s += clen;
6908 }
6909 }
6910 }
6911 STR_SET_LEN(rev, RSTRING_LEN(str));
6912 str_enc_copy_direct(rev, str);
6913 ENC_CODERANGE_SET(rev, cr);
6914
6915 return rev;
6916}
6917
6918
6919/*
6920 * call-seq:
6921 * reverse! -> self
6922 *
6923 * Returns +self+ with its characters reversed:
6924 *
6925 * s = 'stressed'
6926 * s.reverse! # => "desserts"
6927 * s # => "desserts"
6928 *
6929 */
6930
6931static VALUE
6932rb_str_reverse_bang(VALUE str)
6933{
6934 if (RSTRING_LEN(str) > 1) {
6935 if (single_byte_optimizable(str)) {
6936 char *s, *e, c;
6937
6938 str_modify_keep_cr(str);
6939 s = RSTRING_PTR(str);
6940 e = RSTRING_END(str) - 1;
6941 while (s < e) {
6942 c = *s;
6943 *s++ = *e;
6944 *e-- = c;
6945 }
6946 }
6947 else {
6948 str_shared_replace(str, rb_str_reverse(str));
6949 }
6950 }
6951 else {
6952 str_modify_keep_cr(str);
6953 }
6954 return str;
6955}
6956
6957
6958/*
6959 * call-seq:
6960 * include?(other_string) -> true or false
6961 *
6962 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6963 *
6964 * s = 'foo'
6965 * s.include?('f') # => true
6966 * s.include?('fo') # => true
6967 * s.include?('food') # => false
6968 *
6969 */
6970
6971VALUE
6972rb_str_include(VALUE str, VALUE arg)
6973{
6974 long i;
6975
6976 StringValue(arg);
6977 i = rb_str_index(str, arg, 0);
6978
6979 return RBOOL(i != -1);
6980}
6981
6982
6983/*
6984 * call-seq:
6985 * to_i(base = 10) -> integer
6986 *
6987 * Returns the result of interpreting leading characters in +self+
6988 * as an integer in the given +base+ (which must be in (0, 2..36)):
6989 *
6990 * '123456'.to_i # => 123456
6991 * '123def'.to_i(16) # => 1195503
6992 *
 *  With +base+ zero, +self+ may contain leading characters
 *  to specify the actual base:
6995 *
6996 * '123def'.to_i(0) # => 123
6997 * '0123def'.to_i(0) # => 83
6998 * '0b123def'.to_i(0) # => 1
6999 * '0o123def'.to_i(0) # => 83
7000 * '0d123def'.to_i(0) # => 123
7001 * '0x123def'.to_i(0) # => 1195503
7002 *
7003 * Characters past a leading valid number (in the given +base+) are ignored:
7004 *
7005 * '12.345'.to_i # => 12
7006 * '12345'.to_i(2) # => 1
7007 *
7008 * Returns zero if there is no leading valid number:
7009 *
7010 * 'abcdef'.to_i # => 0
7011 * '2'.to_i(2) # => 0
7012 *
7013 */
7014
7015static VALUE
7016rb_str_to_i(int argc, VALUE *argv, VALUE str)
7017{
7018 int base = 10;
7019
7020 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7021 rb_raise(rb_eArgError, "invalid radix %d", base);
7022 }
7023 return rb_str_to_inum(str, base, FALSE);
7024}
7025
7026
7027/*
7028 * call-seq:
7029 * to_f -> float
7030 *
7031 * Returns the result of interpreting leading characters in +self+ as a Float:
7032 *
7033 * '3.14159'.to_f # => 3.14159
7034 * '1.234e-2'.to_f # => 0.01234
7035 *
 *  Characters past a leading valid number are ignored:
7037 *
7038 * '3.14 (pi to two places)'.to_f # => 3.14
7039 *
7040 * Returns zero if there is no leading valid number:
7041 *
7042 * 'abcdef'.to_f # => 0.0
7043 *
7044 */
7045
7046static VALUE
7047rb_str_to_f(VALUE str)
7048{
7049 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7050}
7051
7052
7053/*
7054 * call-seq:
7055 * to_s -> self or string
7056 *
 *  Returns +self+ if +self+ is a +String+,
 *  or +self+ converted to a +String+ if +self+ is an instance of a subclass of +String+.
7059 */
7060
7061static VALUE
7062rb_str_to_s(VALUE str)
7063{
7064 if (rb_obj_class(str) != rb_cString) {
7065 return str_duplicate(rb_cString, str);
7066 }
7067 return str;
7068}
7069
#if 0
/*
 * Disabled by the enclosing "#if 0"; this helper is not compiled.
 *
 * Writes code point c, encoded in enc, into a temporary buffer and
 * appends the resulting bytes to str.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);     /* encoded length of c in enc */

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
7081
/*
 * Buffer length needed for one escaped character.  The longest form is
 * "\x{XXXXXXXX}": 3 bytes for "\x{", 8 hex digits of a 32-bit unsigned
 * int, 1 byte for "}", plus the terminating NUL.
 */
#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7083
7084int
7085rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7086{
7087 char buf[CHAR_ESC_LEN + 1];
7088 int l;
7089
7090#if SIZEOF_INT > 4
7091 c &= 0xffffffff;
7092#endif
7093 if (unicode_p) {
7094 if (c < 0x7F && ISPRINT(c)) {
7095 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7096 }
7097 else if (c < 0x10000) {
7098 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7099 }
7100 else {
7101 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7102 }
7103 }
7104 else {
7105 if (c < 0x100) {
7106 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7107 }
7108 else {
7109 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7110 }
7111 }
7112 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7113 rb_str_buf_cat(result, buf, l);
7114 return l;
7115}
7116
/*
 * Returns the backslash escape sequence used for control character +c+
 * (for example "\\n" for a newline), or NULL when +c+ has no dedicated
 * escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *text;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].text;
    }
    return NULL;
}
7134
/*
 * Returns a new US-ASCII string holding +str+ with special and
 * non-printable characters replaced by escape sequences.
 *
 * Runs of characters that need no escaping are copied in bulk: +prev+
 * marks the start of the pending unescaped run and +p+ is the scan cursor.
 */
VALUE
rb_str_escape(VALUE str)
{
    int encidx = ENCODING_GET(str);
    rb_encoding *enc = rb_enc_from_index(encidx);
    const char *p = RSTRING_PTR(str);
    const char *pend = RSTRING_END(str);
    const char *prev = p;
    char buf[CHAR_ESC_LEN + 1];
    VALUE result = rb_str_buf_new(0);
    int unicode_p = rb_enc_unicode_p(enc);
    int asciicompat = rb_enc_asciicompat(enc);

    while (p < pend) {
        unsigned int c;
        const char *cc;
        int n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
            /*
             * No valid character here: flush the pending run, then emit
             * up to mbminlen raw bytes as \xXX.
             */
            if (p > prev) str_buf_cat(result, prev, p - prev);
            n = rb_enc_mbminlen(enc);
            if (pend < p + n)
                n = (int)(pend - p);
            while (n--) {
                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
                str_buf_cat(result, buf, strlen(buf));
                prev = ++p;
            }
            continue;
        }
        n = MBCLEN_CHARFOUND_LEN(n);
        c = rb_enc_mbc_to_codepoint(p, pend, enc);
        p += n;     /* p now points past the character; p - n is its start */
        cc = ruby_escaped_char(c);
        if (cc) {
            /* Character with a dedicated escape such as \n or \t. */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            str_buf_cat(result, cc, strlen(cc));
            prev = p;
        }
        else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
            /* Printable ASCII: leave it in the pending run. */
        }
        else {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            rb_str_buf_cat_escaped_char(result, c, unicode_p);
            prev = p;
        }
    }
    /* Flush the trailing unescaped run. */
    if (p > prev) str_buf_cat(result, prev, p - prev);
    ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);

    return result;
}
7186
7187/*
7188 * call-seq:
7189 * inspect -> string
7190 *
7191 * Returns a printable version of +self+, enclosed in double-quotes,
7192 * and with special characters escaped:
7193 *
7194 * s = "foo\tbar\tbaz\n"
7195 * s.inspect
7196 * # => "\"foo\\tbar\\tbaz\\n\""
7197 *
7198 */
7199
7200VALUE
7202{
7203 int encidx = ENCODING_GET(str);
7204 rb_encoding *enc = rb_enc_from_index(encidx);
7205 const char *p, *pend, *prev;
7206 char buf[CHAR_ESC_LEN + 1];
7207 VALUE result = rb_str_buf_new(0);
7208 rb_encoding *resenc = rb_default_internal_encoding();
7209 int unicode_p = rb_enc_unicode_p(enc);
7210 int asciicompat = rb_enc_asciicompat(enc);
7211
7212 if (resenc == NULL) resenc = rb_default_external_encoding();
7213 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7214 rb_enc_associate(result, resenc);
7215 str_buf_cat2(result, "\"");
7216
7217 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7218 prev = p;
7219 while (p < pend) {
7220 unsigned int c, cc;
7221 int n;
7222
7223 n = rb_enc_precise_mbclen(p, pend, enc);
7224 if (!MBCLEN_CHARFOUND_P(n)) {
7225 if (p > prev) str_buf_cat(result, prev, p - prev);
7226 n = rb_enc_mbminlen(enc);
7227 if (pend < p + n)
7228 n = (int)(pend - p);
7229 while (n--) {
7230 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7231 str_buf_cat(result, buf, strlen(buf));
7232 prev = ++p;
7233 }
7234 continue;
7235 }
7236 n = MBCLEN_CHARFOUND_LEN(n);
7237 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7238 p += n;
7239 if ((asciicompat || unicode_p) &&
7240 (c == '"'|| c == '\\' ||
7241 (c == '#' &&
7242 p < pend &&
7243 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7244 (cc = rb_enc_codepoint(p,pend,enc),
7245 (cc == '$' || cc == '@' || cc == '{'))))) {
7246 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7247 str_buf_cat2(result, "\\");
7248 if (asciicompat || enc == resenc) {
7249 prev = p - n;
7250 continue;
7251 }
7252 }
7253 switch (c) {
7254 case '\n': cc = 'n'; break;
7255 case '\r': cc = 'r'; break;
7256 case '\t': cc = 't'; break;
7257 case '\f': cc = 'f'; break;
7258 case '\013': cc = 'v'; break;
7259 case '\010': cc = 'b'; break;
7260 case '\007': cc = 'a'; break;
7261 case 033: cc = 'e'; break;
7262 default: cc = 0; break;
7263 }
7264 if (cc) {
7265 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7266 buf[0] = '\\';
7267 buf[1] = (char)cc;
7268 str_buf_cat(result, buf, 2);
7269 prev = p;
7270 continue;
7271 }
7272 /* The special casing of 0x85 (NEXT_LINE) here is because
7273 * Oniguruma historically treats it as printable, but it
7274 * doesn't match the print POSIX bracket class or character
7275 * property in regexps.
7276 *
7277 * See Ruby Bug #16842 for details:
7278 * https://bugs.ruby-lang.org/issues/16842
7279 */
7280 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7281 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7282 continue;
7283 }
7284 else {
7285 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7286 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7287 prev = p;
7288 continue;
7289 }
7290 }
7291 if (p > prev) str_buf_cat(result, prev, p - prev);
7292 str_buf_cat2(result, "\"");
7293
7294 return result;
7295}
7296
7297#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7298
7299/*
7300 * call-seq:
7301 * dump -> string
7302 *
7303 * Returns a printable version of +self+, enclosed in double-quotes,
7304 * with special characters escaped, and with non-printing characters
7305 * replaced by hexadecimal notation:
7306 *
7307 * "hello \n ''".dump # => "\"hello \\n ''\""
7308 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7309 *
7310 * Related: String#undump (inverse of String#dump).
7311 *
7312 */
7313
7314VALUE
7316{
7317 int encidx = rb_enc_get_index(str);
7318 rb_encoding *enc = rb_enc_from_index(encidx);
7319 long len;
7320 const char *p, *pend;
7321 char *q, *qend;
7322 VALUE result;
7323 int u8 = (encidx == rb_utf8_encindex());
7324 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7325
7326 len = 2; /* "" */
7327 if (!rb_enc_asciicompat(enc)) {
7328 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7329 len += strlen(enc->name);
7330 }
7331
7332 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7333 while (p < pend) {
7334 int clen;
7335 unsigned char c = *p++;
7336
7337 switch (c) {
7338 case '"': case '\\':
7339 case '\n': case '\r':
7340 case '\t': case '\f':
7341 case '\013': case '\010': case '\007': case '\033':
7342 clen = 2;
7343 break;
7344
7345 case '#':
7346 clen = IS_EVSTR(p, pend) ? 2 : 1;
7347 break;
7348
7349 default:
7350 if (ISPRINT(c)) {
7351 clen = 1;
7352 }
7353 else {
7354 if (u8 && c > 0x7F) { /* \u notation */
7355 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7356 if (MBCLEN_CHARFOUND_P(n)) {
7357 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7358 if (cc <= 0xFFFF)
7359 clen = 6; /* \uXXXX */
7360 else if (cc <= 0xFFFFF)
7361 clen = 9; /* \u{XXXXX} */
7362 else
7363 clen = 10; /* \u{XXXXXX} */
7364 p += MBCLEN_CHARFOUND_LEN(n)-1;
7365 break;
7366 }
7367 }
7368 clen = 4; /* \xNN */
7369 }
7370 break;
7371 }
7372
7373 if (clen > LONG_MAX - len) {
7374 rb_raise(rb_eRuntimeError, "string size too big");
7375 }
7376 len += clen;
7377 }
7378
7379 result = rb_str_new(0, len);
7380 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7381 q = RSTRING_PTR(result); qend = q + len + 1;
7382
7383 *q++ = '"';
7384 while (p < pend) {
7385 unsigned char c = *p++;
7386
7387 if (c == '"' || c == '\\') {
7388 *q++ = '\\';
7389 *q++ = c;
7390 }
7391 else if (c == '#') {
7392 if (IS_EVSTR(p, pend)) *q++ = '\\';
7393 *q++ = '#';
7394 }
7395 else if (c == '\n') {
7396 *q++ = '\\';
7397 *q++ = 'n';
7398 }
7399 else if (c == '\r') {
7400 *q++ = '\\';
7401 *q++ = 'r';
7402 }
7403 else if (c == '\t') {
7404 *q++ = '\\';
7405 *q++ = 't';
7406 }
7407 else if (c == '\f') {
7408 *q++ = '\\';
7409 *q++ = 'f';
7410 }
7411 else if (c == '\013') {
7412 *q++ = '\\';
7413 *q++ = 'v';
7414 }
7415 else if (c == '\010') {
7416 *q++ = '\\';
7417 *q++ = 'b';
7418 }
7419 else if (c == '\007') {
7420 *q++ = '\\';
7421 *q++ = 'a';
7422 }
7423 else if (c == '\033') {
7424 *q++ = '\\';
7425 *q++ = 'e';
7426 }
7427 else if (ISPRINT(c)) {
7428 *q++ = c;
7429 }
7430 else {
7431 *q++ = '\\';
7432 if (u8) {
7433 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7434 if (MBCLEN_CHARFOUND_P(n)) {
7435 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7436 p += n;
7437 if (cc <= 0xFFFF)
7438 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7439 else
7440 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7441 q += strlen(q);
7442 continue;
7443 }
7444 }
7445 snprintf(q, qend-q, "x%02X", c);
7446 q += 3;
7447 }
7448 }
7449 *q++ = '"';
7450 *q = '\0';
7451 if (!rb_enc_asciicompat(enc)) {
7452 snprintf(q, qend-q, nonascii_suffix, enc->name);
7453 encidx = rb_ascii8bit_encindex();
7454 }
7455 /* result from dump is ASCII */
7456 rb_enc_associate_index(result, encidx);
7458 return result;
7459}
7460
/*
 * Translates the letter that follows a backslash in a simple escape
 * (n, r, t, f, v, b, a, e) into the control character it denotes.
 * Returns -1 for any other value instead of running off the end of the
 * function.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        return -1;
    }
}
7484
7485static void
7486undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7487{
7488 const char *s = *ss;
7489 unsigned int c;
7490 int codelen;
7491 size_t hexlen;
7492 unsigned char buf[6];
7493 static rb_encoding *enc_utf8 = NULL;
7494
7495 switch (*s) {
7496 case '\\':
7497 case '"':
7498 case '#':
7499 rb_str_cat(undumped, s, 1); /* cat itself */
7500 s++;
7501 break;
7502 case 'n':
7503 case 'r':
7504 case 't':
7505 case 'f':
7506 case 'v':
7507 case 'b':
7508 case 'a':
7509 case 'e':
7510 *buf = unescape_ascii(*s);
7511 rb_str_cat(undumped, (char *)buf, 1);
7512 s++;
7513 break;
7514 case 'u':
7515 if (*binary) {
7516 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7517 }
7518 *utf8 = true;
7519 if (++s >= s_end) {
7520 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7521 }
7522 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7523 if (*penc != enc_utf8) {
7524 *penc = enc_utf8;
7525 rb_enc_associate(undumped, enc_utf8);
7526 }
7527 if (*s == '{') { /* handle \u{...} form */
7528 s++;
7529 for (;;) {
7530 if (s >= s_end) {
7531 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7532 }
7533 if (*s == '}') {
7534 s++;
7535 break;
7536 }
7537 if (ISSPACE(*s)) {
7538 s++;
7539 continue;
7540 }
7541 c = scan_hex(s, s_end-s, &hexlen);
7542 if (hexlen == 0 || hexlen > 6) {
7543 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7544 }
7545 if (c > 0x10ffff) {
7546 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7547 }
7548 if (0xd800 <= c && c <= 0xdfff) {
7549 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7550 }
7551 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7552 rb_str_cat(undumped, (char *)buf, codelen);
7553 s += hexlen;
7554 }
7555 }
7556 else { /* handle \uXXXX form */
7557 c = scan_hex(s, 4, &hexlen);
7558 if (hexlen != 4) {
7559 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7560 }
7561 if (0xd800 <= c && c <= 0xdfff) {
7562 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7563 }
7564 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7565 rb_str_cat(undumped, (char *)buf, codelen);
7566 s += hexlen;
7567 }
7568 break;
7569 case 'x':
7570 if (*utf8) {
7571 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7572 }
7573 *binary = true;
7574 if (++s >= s_end) {
7575 rb_raise(rb_eRuntimeError, "invalid hex escape");
7576 }
7577 *buf = scan_hex(s, 2, &hexlen);
7578 if (hexlen != 2) {
7579 rb_raise(rb_eRuntimeError, "invalid hex escape");
7580 }
7581 rb_str_cat(undumped, (char *)buf, 1);
7582 s += hexlen;
7583 break;
7584 default:
7585 rb_str_cat(undumped, s-1, 2);
7586 s++;
7587 }
7588
7589 *ss = s;
7590}
7591
7592static VALUE rb_str_is_ascii_only_p(VALUE str);
7593
7594/*
7595 * call-seq:
7596 * undump -> string
7597 *
7598 * Returns an unescaped version of +self+:
7599 *
7600 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7601 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7602 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7603 * s_undumped == s_orig # => true
7604 *
7605 * Related: String#dump (inverse of String#undump).
7606 *
7607 */
7608
7609static VALUE
7610str_undump(VALUE str)
7611{
7612 const char *s = RSTRING_PTR(str);
7613 const char *s_end = RSTRING_END(str);
7614 rb_encoding *enc = rb_enc_get(str);
7615 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7616 bool utf8 = false;
7617 bool binary = false;
7618 int w;
7619
7621 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7622 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7623 }
7624 if (!str_null_check(str, &w)) {
7625 rb_raise(rb_eRuntimeError, "string contains null byte");
7626 }
7627 if (RSTRING_LEN(str) < 2) goto invalid_format;
7628 if (*s != '"') goto invalid_format;
7629
7630 /* strip '"' at the start */
7631 s++;
7632
7633 for (;;) {
7634 if (s >= s_end) {
7635 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7636 }
7637
7638 if (*s == '"') {
7639 /* epilogue */
7640 s++;
7641 if (s == s_end) {
7642 /* ascii compatible dumped string */
7643 break;
7644 }
7645 else {
7646 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7647 static const char dup_suffix[] = ".dup";
7648 const char *encname;
7649 int encidx;
7650 ptrdiff_t size;
7651
7652 /* check separately for strings dumped by older versions */
7653 size = sizeof(dup_suffix) - 1;
7654 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7655
7656 size = sizeof(force_encoding_suffix) - 1;
7657 if (s_end - s <= size) goto invalid_format;
7658 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7659 s += size;
7660
7661 if (utf8) {
7662 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7663 }
7664
7665 encname = s;
7666 s = memchr(s, '"', s_end-s);
7667 size = s - encname;
7668 if (!s) goto invalid_format;
7669 if (s_end - s != 2) goto invalid_format;
7670 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7671
7672 encidx = rb_enc_find_index2(encname, (long)size);
7673 if (encidx < 0) {
7674 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7675 }
7676 rb_enc_associate_index(undumped, encidx);
7677 }
7678 break;
7679 }
7680
7681 if (*s == '\\') {
7682 s++;
7683 if (s >= s_end) {
7684 rb_raise(rb_eRuntimeError, "invalid escape");
7685 }
7686 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7687 }
7688 else {
7689 rb_str_cat(undumped, s++, 1);
7690 }
7691 }
7692
7693 RB_GC_GUARD(str);
7694
7695 return undumped;
7696invalid_format:
7697 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7698}
7699
7700static void
7701rb_str_check_dummy_enc(rb_encoding *enc)
7702{
7703 if (rb_enc_dummy_p(enc)) {
7704 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7705 rb_enc_name(enc));
7706 }
7707}
7708
7709static rb_encoding *
7710str_true_enc(VALUE str)
7711{
7712 rb_encoding *enc = STR_ENC_GET(str);
7713 rb_str_check_dummy_enc(enc);
7714 return enc;
7715}
7716
7717static OnigCaseFoldType
7718check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7719{
7720 if (argc==0)
7721 return flags;
7722 if (argc>2)
7723 rb_raise(rb_eArgError, "too many options");
7724 if (argv[0]==sym_turkic) {
7725 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7726 if (argc==2) {
7727 if (argv[1]==sym_lithuanian)
7728 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7729 else
7730 rb_raise(rb_eArgError, "invalid second option");
7731 }
7732 }
7733 else if (argv[0]==sym_lithuanian) {
7734 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7735 if (argc==2) {
7736 if (argv[1]==sym_turkic)
7737 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7738 else
7739 rb_raise(rb_eArgError, "invalid second option");
7740 }
7741 }
7742 else if (argc>1)
7743 rb_raise(rb_eArgError, "too many options");
7744 else if (argv[0]==sym_ascii)
7745 flags |= ONIGENC_CASE_ASCII_ONLY;
7746 else if (argv[0]==sym_fold) {
7747 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7748 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7749 else
7750 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7751 }
7752 else
7753 rb_raise(rb_eArgError, "invalid option");
7754 return flags;
7755}
7756
7757static inline bool
7758case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7759{
7760 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7761 return true;
7762 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7763}
7764
7765/* 16 should be long enough to absorb any kind of single character length increase */
7766#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7767#ifndef CASEMAP_DEBUG
7768# define CASEMAP_DEBUG 0
7769#endif
7770
7771struct mapping_buffer;
7772typedef struct mapping_buffer {
7773 size_t capa;
7774 size_t used;
7775 struct mapping_buffer *next;
7776 OnigUChar space[FLEX_ARY_LEN];
7778
7779static void
7780mapping_buffer_free(void *p)
7781{
7782 mapping_buffer *previous_buffer;
7783 mapping_buffer *current_buffer = p;
7784 while (current_buffer) {
7785 previous_buffer = current_buffer;
7786 current_buffer = current_buffer->next;
7787 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7788 }
7789}
7790
7791static const rb_data_type_t mapping_buffer_type = {
7792 "mapping_buffer",
7793 {0, mapping_buffer_free,},
7794 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7795};
7796
7797static VALUE
7798rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7799{
7800 VALUE target;
7801
7802 const OnigUChar *source_current, *source_end;
7803 int target_length = 0;
7804 VALUE buffer_anchor;
7805 mapping_buffer *current_buffer = 0;
7806 mapping_buffer **pre_buffer;
7807 size_t buffer_count = 0;
7808 int buffer_length_or_invalid;
7809
7810 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7811
7812 source_current = (OnigUChar*)RSTRING_PTR(source);
7813 source_end = (OnigUChar*)RSTRING_END(source);
7814
7815 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7816 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7817 while (source_current < source_end) {
7818 /* increase multiplier using buffer count to converge quickly */
7819 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7820 if (CASEMAP_DEBUG) {
7821 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7822 }
7823 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7824 *pre_buffer = current_buffer;
7825 pre_buffer = &current_buffer->next;
7826 current_buffer->next = NULL;
7827 current_buffer->capa = capa;
7828 buffer_length_or_invalid = enc->case_map(flags,
7829 &source_current, source_end,
7830 current_buffer->space,
7831 current_buffer->space+current_buffer->capa,
7832 enc);
7833 if (buffer_length_or_invalid < 0) {
7834 current_buffer = DATA_PTR(buffer_anchor);
7835 DATA_PTR(buffer_anchor) = 0;
7836 mapping_buffer_free(current_buffer);
7837 rb_raise(rb_eArgError, "input string invalid");
7838 }
7839 target_length += current_buffer->used = buffer_length_or_invalid;
7840 }
7841 if (CASEMAP_DEBUG) {
7842 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7843 }
7844
7845 if (buffer_count==1) {
7846 target = rb_str_new((const char*)current_buffer->space, target_length);
7847 }
7848 else {
7849 char *target_current;
7850
7851 target = rb_str_new(0, target_length);
7852 target_current = RSTRING_PTR(target);
7853 current_buffer = DATA_PTR(buffer_anchor);
7854 while (current_buffer) {
7855 memcpy(target_current, current_buffer->space, current_buffer->used);
7856 target_current += current_buffer->used;
7857 current_buffer = current_buffer->next;
7858 }
7859 }
7860 current_buffer = DATA_PTR(buffer_anchor);
7861 DATA_PTR(buffer_anchor) = 0;
7862 mapping_buffer_free(current_buffer);
7863
7864 RB_GC_GUARD(buffer_anchor);
7865
7866 /* TODO: check about string terminator character */
7867 str_enc_copy_direct(target, source);
7868 /*ENC_CODERANGE_SET(mapped, cr);*/
7869
7870 return target;
7871}
7872
7873static VALUE
7874rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7875{
7876 const OnigUChar *source_current, *source_end;
7877 OnigUChar *target_current, *target_end;
7878 long old_length = RSTRING_LEN(source);
7879 int length_or_invalid;
7880
7881 if (old_length == 0) return Qnil;
7882
7883 source_current = (OnigUChar*)RSTRING_PTR(source);
7884 source_end = (OnigUChar*)RSTRING_END(source);
7885 if (source == target) {
7886 target_current = (OnigUChar*)source_current;
7887 target_end = (OnigUChar*)source_end;
7888 }
7889 else {
7890 target_current = (OnigUChar*)RSTRING_PTR(target);
7891 target_end = (OnigUChar*)RSTRING_END(target);
7892 }
7893
7894 length_or_invalid = onigenc_ascii_only_case_map(flags,
7895 &source_current, source_end,
7896 target_current, target_end, enc);
7897 if (length_or_invalid < 0)
7898 rb_raise(rb_eArgError, "input string invalid");
7899 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7900 fprintf(stderr, "problem with rb_str_ascii_casemap"
7901 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7902 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7903 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7904 }
7905
7906 str_enc_copy(target, source);
7907
7908 return target;
7909}
7910
7911static bool
7912upcase_single(VALUE str)
7913{
7914 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7915 bool modified = false;
7916
7917 while (s < send) {
7918 unsigned int c = *(unsigned char*)s;
7919
7920 if ('a' <= c && c <= 'z') {
7921 *s = 'A' + (c - 'a');
7922 modified = true;
7923 }
7924 s++;
7925 }
7926 return modified;
7927}
7928
7929/*
7930 * call-seq:
7931 * upcase!(*options) -> self or nil
7932 *
7933 * Upcases the characters in +self+;
7934 * returns +self+ if any changes were made, +nil+ otherwise:
7935 *
7936 * s = 'Hello World!' # => "Hello World!"
7937 * s.upcase! # => "HELLO WORLD!"
7938 * s # => "HELLO WORLD!"
7939 * s.upcase! # => nil
7940 *
7941 * The casing may be affected by the given +options+;
7942 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7943 *
7944 * Related: String#upcase, String#downcase, String#downcase!.
7945 *
7946 */
7947
7948static VALUE
7949rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7950{
7951 rb_encoding *enc;
7952 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7953
7954 flags = check_case_options(argc, argv, flags);
7955 str_modify_keep_cr(str);
7956 enc = str_true_enc(str);
7957 if (case_option_single_p(flags, enc, str)) {
7958 if (upcase_single(str))
7959 flags |= ONIGENC_CASE_MODIFIED;
7960 }
7961 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7962 rb_str_ascii_casemap(str, str, &flags, enc);
7963 else
7964 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7965
7966 if (ONIGENC_CASE_MODIFIED&flags) return str;
7967 return Qnil;
7968}
7969
7970
7971/*
7972 * call-seq:
7973 * upcase(*options) -> string
7974 *
7975 * Returns a string containing the upcased characters in +self+:
7976 *
7977 * s = 'Hello World!' # => "Hello World!"
7978 * s.upcase # => "HELLO WORLD!"
7979 *
7980 * The casing may be affected by the given +options+;
7981 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7982 *
7983 * Related: String#upcase!, String#downcase, String#downcase!.
7984 *
7985 */
7986
7987static VALUE
7988rb_str_upcase(int argc, VALUE *argv, VALUE str)
7989{
7990 rb_encoding *enc;
7991 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7992 VALUE ret;
7993
7994 flags = check_case_options(argc, argv, flags);
7995 enc = str_true_enc(str);
7996 if (case_option_single_p(flags, enc, str)) {
7997 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7998 str_enc_copy_direct(ret, str);
7999 upcase_single(ret);
8000 }
8001 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8002 ret = rb_str_new(0, RSTRING_LEN(str));
8003 rb_str_ascii_casemap(str, ret, &flags, enc);
8004 }
8005 else {
8006 ret = rb_str_casemap(str, &flags, enc);
8007 }
8008
8009 return ret;
8010}
8011
8012static bool
8013downcase_single(VALUE str)
8014{
8015 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8016 bool modified = false;
8017
8018 while (s < send) {
8019 unsigned int c = *(unsigned char*)s;
8020
8021 if ('A' <= c && c <= 'Z') {
8022 *s = 'a' + (c - 'A');
8023 modified = true;
8024 }
8025 s++;
8026 }
8027
8028 return modified;
8029}
8030
8031/*
8032 * call-seq:
8033 * downcase!(*options) -> self or nil
8034 *
8035 * Downcases the characters in +self+;
8036 * returns +self+ if any changes were made, +nil+ otherwise:
8037 *
8038 * s = 'Hello World!' # => "Hello World!"
8039 * s.downcase! # => "hello world!"
8040 * s # => "hello world!"
8041 * s.downcase! # => nil
8042 *
8043 * The casing may be affected by the given +options+;
8044 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8045 *
8046 * Related: String#downcase, String#upcase, String#upcase!.
8047 *
8048 */
8049
8050static VALUE
8051rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8052{
8053 rb_encoding *enc;
8054 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8055
8056 flags = check_case_options(argc, argv, flags);
8057 str_modify_keep_cr(str);
8058 enc = str_true_enc(str);
8059 if (case_option_single_p(flags, enc, str)) {
8060 if (downcase_single(str))
8061 flags |= ONIGENC_CASE_MODIFIED;
8062 }
8063 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8064 rb_str_ascii_casemap(str, str, &flags, enc);
8065 else
8066 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8067
8068 if (ONIGENC_CASE_MODIFIED&flags) return str;
8069 return Qnil;
8070}
8071
8072
8073/*
8074 * call-seq:
8075 * downcase(*options) -> string
8076 *
8077 * Returns a string containing the downcased characters in +self+:
8078 *
8079 * s = 'Hello World!' # => "Hello World!"
8080 * s.downcase # => "hello world!"
8081 *
8082 * The casing may be affected by the given +options+;
8083 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8084 *
8085 * Related: String#downcase!, String#upcase, String#upcase!.
8086 *
8087 */
8088
8089static VALUE
8090rb_str_downcase(int argc, VALUE *argv, VALUE str)
8091{
8092 rb_encoding *enc;
8093 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8094 VALUE ret;
8095
8096 flags = check_case_options(argc, argv, flags);
8097 enc = str_true_enc(str);
8098 if (case_option_single_p(flags, enc, str)) {
8099 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8100 str_enc_copy_direct(ret, str);
8101 downcase_single(ret);
8102 }
8103 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8104 ret = rb_str_new(0, RSTRING_LEN(str));
8105 rb_str_ascii_casemap(str, ret, &flags, enc);
8106 }
8107 else {
8108 ret = rb_str_casemap(str, &flags, enc);
8109 }
8110
8111 return ret;
8112}
8113
8114
8115/*
8116 * call-seq:
8117 * capitalize!(*options) -> self or nil
8118 *
8119 * Upcases the first character in +self+;
8120 * downcases the remaining characters;
8121 * returns +self+ if any changes were made, +nil+ otherwise:
8122 *
8123 * s = 'hello World!' # => "hello World!"
8124 * s.capitalize! # => "Hello world!"
8125 * s # => "Hello world!"
8126 * s.capitalize! # => nil
8127 *
8128 * The casing may be affected by the given +options+;
8129 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8130 *
8131 * Related: String#capitalize.
8132 *
8133 */
8134
8135static VALUE
8136rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8137{
8138 rb_encoding *enc;
8139 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8140
8141 flags = check_case_options(argc, argv, flags);
8142 str_modify_keep_cr(str);
8143 enc = str_true_enc(str);
8144 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8145 if (flags&ONIGENC_CASE_ASCII_ONLY)
8146 rb_str_ascii_casemap(str, str, &flags, enc);
8147 else
8148 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8149
8150 if (ONIGENC_CASE_MODIFIED&flags) return str;
8151 return Qnil;
8152}
8153
8154
8155/*
8156 * call-seq:
8157 * capitalize(*options) -> string
8158 *
8159 * Returns a string containing the characters in +self+;
8160 * the first character is upcased;
8161 * the remaining characters are downcased:
8162 *
8163 * s = 'hello World!' # => "hello World!"
8164 * s.capitalize # => "Hello world!"
8165 *
8166 * The casing may be affected by the given +options+;
8167 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8168 *
8169 * Related: String#capitalize!.
8170 *
8171 */
8172
8173static VALUE
8174rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8175{
8176 rb_encoding *enc;
8177 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8178 VALUE ret;
8179
8180 flags = check_case_options(argc, argv, flags);
8181 enc = str_true_enc(str);
8182 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8183 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8184 ret = rb_str_new(0, RSTRING_LEN(str));
8185 rb_str_ascii_casemap(str, ret, &flags, enc);
8186 }
8187 else {
8188 ret = rb_str_casemap(str, &flags, enc);
8189 }
8190 return ret;
8191}
8192
8193
8194/*
8195 * call-seq:
8196 * swapcase!(*options) -> self or nil
8197 *
 *  Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
8200 * returns +self+ if any changes were made, +nil+ otherwise:
8201 *
8202 * s = 'Hello World!' # => "Hello World!"
8203 * s.swapcase! # => "hELLO wORLD!"
8204 * s # => "hELLO wORLD!"
8205 * ''.swapcase! # => nil
8206 *
8207 * The casing may be affected by the given +options+;
8208 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8209 *
8210 * Related: String#swapcase.
8211 *
8212 */
8213
8214static VALUE
8215rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8216{
8217 rb_encoding *enc;
8218 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8219
8220 flags = check_case_options(argc, argv, flags);
8221 str_modify_keep_cr(str);
8222 enc = str_true_enc(str);
8223 if (flags&ONIGENC_CASE_ASCII_ONLY)
8224 rb_str_ascii_casemap(str, str, &flags, enc);
8225 else
8226 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8227
8228 if (ONIGENC_CASE_MODIFIED&flags) return str;
8229 return Qnil;
8230}
8231
8232
8233/*
8234 * call-seq:
8235 * swapcase(*options) -> string
8236 *
8237 * Returns a string containing the characters in +self+, with cases reversed;
8238 * each uppercase character is downcased;
8239 * each lowercase character is upcased:
8240 *
8241 * s = 'Hello World!' # => "Hello World!"
8242 * s.swapcase # => "hELLO wORLD!"
8243 *
8244 * The casing may be affected by the given +options+;
8245 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8246 *
8247 * Related: String#swapcase!.
8248 *
8249 */
8250
8251static VALUE
8252rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8253{
8254 rb_encoding *enc;
8255 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8256 VALUE ret;
8257
8258 flags = check_case_options(argc, argv, flags);
8259 enc = str_true_enc(str);
8260 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8261 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8262 ret = rb_str_new(0, RSTRING_LEN(str));
8263 rb_str_ascii_casemap(str, ret, &flags, enc);
8264 }
8265 else {
8266 ret = rb_str_casemap(str, &flags, enc);
8267 }
8268 return ret;
8269}
8270
8271typedef unsigned char *USTR;
8272
8273struct tr {
8274 int gen;
8275 unsigned int now, max;
8276 char *p, *pend;
8277};
8278
8279static unsigned int
8280trnext(struct tr *t, rb_encoding *enc)
8281{
8282 int n;
8283
8284 for (;;) {
8285 nextpart:
8286 if (!t->gen) {
8287 if (t->p == t->pend) return -1;
8288 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8289 t->p += n;
8290 }
8291 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8292 t->p += n;
8293 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8294 t->p += n;
8295 if (t->p < t->pend) {
8296 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8297 t->p += n;
8298 if (t->now > c) {
8299 if (t->now < 0x80 && c < 0x80) {
8300 rb_raise(rb_eArgError,
8301 "invalid range \"%c-%c\" in string transliteration",
8302 t->now, c);
8303 }
8304 else {
8305 rb_raise(rb_eArgError, "invalid range in string transliteration");
8306 }
8307 continue; /* not reached */
8308 }
8309 else if (t->now < c) {
8310 t->gen = 1;
8311 t->max = c;
8312 }
8313 }
8314 }
8315 return t->now;
8316 }
8317 else {
8318 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8319 if (t->now == t->max) {
8320 t->gen = 0;
8321 goto nextpart;
8322 }
8323 }
8324 if (t->now < t->max) {
8325 return t->now;
8326 }
8327 else {
8328 t->gen = 0;
8329 return t->max;
8330 }
8331 }
8332 }
8333}
8334
8335static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8336
8337static VALUE
8338tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8339{
8340 const unsigned int errc = -1;
8341 unsigned int trans[256];
8342 rb_encoding *enc, *e1, *e2;
8343 struct tr trsrc, trrepl;
8344 int cflag = 0;
8345 unsigned int c, c0, last = 0;
8346 int modify = 0, i, l;
8347 unsigned char *s, *send;
8348 VALUE hash = 0;
8349 int singlebyte = single_byte_optimizable(str);
8350 int termlen;
8351 int cr;
8352
8353#define CHECK_IF_ASCII(c) \
8354 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8355 (cr = ENC_CODERANGE_VALID) : 0)
8356
8357 StringValue(src);
8358 StringValue(repl);
8359 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8360 if (RSTRING_LEN(repl) == 0) {
8361 return rb_str_delete_bang(1, &src, str);
8362 }
8363
8364 cr = ENC_CODERANGE(str);
8365 e1 = rb_enc_check(str, src);
8366 e2 = rb_enc_check(str, repl);
8367 if (e1 == e2) {
8368 enc = e1;
8369 }
8370 else {
8371 enc = rb_enc_check(src, repl);
8372 }
8373 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8374 if (RSTRING_LEN(src) > 1 &&
8375 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8376 trsrc.p + l < trsrc.pend) {
8377 cflag = 1;
8378 trsrc.p += l;
8379 }
8380 trrepl.p = RSTRING_PTR(repl);
8381 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8382 trsrc.gen = trrepl.gen = 0;
8383 trsrc.now = trrepl.now = 0;
8384 trsrc.max = trrepl.max = 0;
8385
8386 if (cflag) {
8387 for (i=0; i<256; i++) {
8388 trans[i] = 1;
8389 }
8390 while ((c = trnext(&trsrc, enc)) != errc) {
8391 if (c < 256) {
8392 trans[c] = errc;
8393 }
8394 else {
8395 if (!hash) hash = rb_hash_new();
8396 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8397 }
8398 }
8399 while ((c = trnext(&trrepl, enc)) != errc)
8400 /* retrieve last replacer */;
8401 last = trrepl.now;
8402 for (i=0; i<256; i++) {
8403 if (trans[i] != errc) {
8404 trans[i] = last;
8405 }
8406 }
8407 }
8408 else {
8409 unsigned int r;
8410
8411 for (i=0; i<256; i++) {
8412 trans[i] = errc;
8413 }
8414 while ((c = trnext(&trsrc, enc)) != errc) {
8415 r = trnext(&trrepl, enc);
8416 if (r == errc) r = trrepl.now;
8417 if (c < 256) {
8418 trans[c] = r;
8419 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8420 }
8421 else {
8422 if (!hash) hash = rb_hash_new();
8423 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8424 }
8425 }
8426 }
8427
8428 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8429 cr = ENC_CODERANGE_7BIT;
8430 str_modify_keep_cr(str);
8431 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8432 termlen = rb_enc_mbminlen(enc);
8433 if (sflag) {
8434 int clen, tlen;
8435 long offset, max = RSTRING_LEN(str);
8436 unsigned int save = -1;
8437 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8438
8439 while (s < send) {
8440 int may_modify = 0;
8441
8442 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8443 if (!MBCLEN_CHARFOUND_P(r)) {
8444 xfree(buf);
8445 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8446 }
8447 clen = MBCLEN_CHARFOUND_LEN(r);
8448 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8449
8450 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8451
8452 s += clen;
8453 if (c < 256) {
8454 c = trans[c];
8455 }
8456 else if (hash) {
8457 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8458 if (NIL_P(tmp)) {
8459 if (cflag) c = last;
8460 else c = errc;
8461 }
8462 else if (cflag) c = errc;
8463 else c = NUM2INT(tmp);
8464 }
8465 else {
8466 c = errc;
8467 }
8468 if (c != (unsigned int)-1) {
8469 if (save == c) {
8470 CHECK_IF_ASCII(c);
8471 continue;
8472 }
8473 save = c;
8474 tlen = rb_enc_codelen(c, enc);
8475 modify = 1;
8476 }
8477 else {
8478 save = -1;
8479 c = c0;
8480 if (enc != e1) may_modify = 1;
8481 }
8482 if ((offset = t - buf) + tlen > max) {
8483 size_t MAYBE_UNUSED(old) = max + termlen;
8484 max = offset + tlen + (send - s);
8485 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8486 t = buf + offset;
8487 }
8488 rb_enc_mbcput(c, t, enc);
8489 if (may_modify && memcmp(s, t, tlen) != 0) {
8490 modify = 1;
8491 }
8492 CHECK_IF_ASCII(c);
8493 t += tlen;
8494 }
8495 if (!STR_EMBED_P(str)) {
8496 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8497 }
8498 TERM_FILL((char *)t, termlen);
8499 RSTRING(str)->as.heap.ptr = (char *)buf;
8500 STR_SET_LEN(str, t - buf);
8501 STR_SET_NOEMBED(str);
8502 RSTRING(str)->as.heap.aux.capa = max;
8503 }
8504 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8505 while (s < send) {
8506 c = (unsigned char)*s;
8507 if (trans[c] != errc) {
8508 if (!cflag) {
8509 c = trans[c];
8510 *s = c;
8511 modify = 1;
8512 }
8513 else {
8514 *s = last;
8515 modify = 1;
8516 }
8517 }
8518 CHECK_IF_ASCII(c);
8519 s++;
8520 }
8521 }
8522 else {
8523 int clen, tlen;
8524 long offset, max = (long)((send - s) * 1.2);
8525 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8526
8527 while (s < send) {
8528 int may_modify = 0;
8529
8530 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8531 if (!MBCLEN_CHARFOUND_P(r)) {
8532 xfree(buf);
8533 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8534 }
8535 clen = MBCLEN_CHARFOUND_LEN(r);
8536 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8537
8538 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8539
8540 if (c < 256) {
8541 c = trans[c];
8542 }
8543 else if (hash) {
8544 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8545 if (NIL_P(tmp)) {
8546 if (cflag) c = last;
8547 else c = errc;
8548 }
8549 else if (cflag) c = errc;
8550 else c = NUM2INT(tmp);
8551 }
8552 else {
8553 c = cflag ? last : errc;
8554 }
8555 if (c != errc) {
8556 tlen = rb_enc_codelen(c, enc);
8557 modify = 1;
8558 }
8559 else {
8560 c = c0;
8561 if (enc != e1) may_modify = 1;
8562 }
8563 if ((offset = t - buf) + tlen > max) {
8564 size_t MAYBE_UNUSED(old) = max + termlen;
8565 max = offset + tlen + (long)((send - s) * 1.2);
8566 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8567 t = buf + offset;
8568 }
8569 if (s != t) {
8570 rb_enc_mbcput(c, t, enc);
8571 if (may_modify && memcmp(s, t, tlen) != 0) {
8572 modify = 1;
8573 }
8574 }
8575 CHECK_IF_ASCII(c);
8576 s += clen;
8577 t += tlen;
8578 }
8579 if (!STR_EMBED_P(str)) {
8580 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8581 }
8582 TERM_FILL((char *)t, termlen);
8583 RSTRING(str)->as.heap.ptr = (char *)buf;
8584 STR_SET_LEN(str, t - buf);
8585 STR_SET_NOEMBED(str);
8586 RSTRING(str)->as.heap.aux.capa = max;
8587 }
8588
8589 if (modify) {
8590 if (cr != ENC_CODERANGE_BROKEN)
8591 ENC_CODERANGE_SET(str, cr);
8592 rb_enc_associate(str, enc);
8593 return str;
8594 }
8595 return Qnil;
8596}
8597
8598
8599/*
8600 * call-seq:
8601 * tr!(selector, replacements) -> self or nil
8602 *
8603 * Like String#tr, but modifies +self+ in place.
8604 * Returns +self+ if any changes were made, +nil+ otherwise.
8605 *
8606 */
8607
8608static VALUE
8609rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8610{
8611 return tr_trans(str, src, repl, 0);
8612}
8613
8614
8615/*
8616 * call-seq:
8617 * tr(selector, replacements) -> new_string
8618 *
8619 * Returns a copy of +self+ with each character specified by string +selector+
8620 * translated to the corresponding character in string +replacements+.
8621 * The correspondence is _positional_:
8622 *
8623 * - Each occurrence of the first character specified by +selector+
8624 * is translated to the first character in +replacements+.
8625 * - Each occurrence of the second character specified by +selector+
8626 * is translated to the second character in +replacements+.
8627 * - And so on.
8628 *
8629 * Example:
8630 *
8631 * 'hello'.tr('el', 'ip') #=> "hippo"
8632 *
8633 * If +replacements+ is shorter than +selector+,
8634 * it is implicitly padded with its own last character:
8635 *
8636 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8637 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8638 *
8639 * Arguments +selector+ and +replacements+ must be valid character selectors
8640 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8641 * and may use any of its valid forms, including negation, ranges, and escaping:
8642 *
8643 * # Negation.
8644 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8645 * # Ranges.
8646 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8647 * # Escapes.
8648 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8649 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8650 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8651 *
8652 */
8653
8654static VALUE
8655rb_str_tr(VALUE str, VALUE src, VALUE repl)
8656{
8657 str = str_duplicate(rb_cString, str);
8658 tr_trans(str, src, repl, 0);
8659 return str;
8660}
8661
8662#define TR_TABLE_MAX (UCHAR_MAX+1)
8663#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8664static void
8665tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8666 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8667{
8668 const unsigned int errc = -1;
8669 char buf[TR_TABLE_MAX];
8670 struct tr tr;
8671 unsigned int c;
8672 VALUE table = 0, ptable = 0;
8673 int i, l, cflag = 0;
8674
8675 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8676 tr.gen = tr.now = tr.max = 0;
8677
8678 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8679 cflag = 1;
8680 tr.p += l;
8681 }
8682 if (first) {
8683 for (i=0; i<TR_TABLE_MAX; i++) {
8684 stable[i] = 1;
8685 }
8686 stable[TR_TABLE_MAX] = cflag;
8687 }
8688 else if (stable[TR_TABLE_MAX] && !cflag) {
8689 stable[TR_TABLE_MAX] = 0;
8690 }
8691 for (i=0; i<TR_TABLE_MAX; i++) {
8692 buf[i] = cflag;
8693 }
8694
8695 while ((c = trnext(&tr, enc)) != errc) {
8696 if (c < TR_TABLE_MAX) {
8697 buf[(unsigned char)c] = !cflag;
8698 }
8699 else {
8700 VALUE key = UINT2NUM(c);
8701
8702 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8703 if (cflag) {
8704 ptable = *ctablep;
8705 table = ptable ? ptable : rb_hash_new();
8706 *ctablep = table;
8707 }
8708 else {
8709 table = rb_hash_new();
8710 ptable = *tablep;
8711 *tablep = table;
8712 }
8713 }
8714 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8715 rb_hash_aset(table, key, Qtrue);
8716 }
8717 }
8718 }
8719 for (i=0; i<TR_TABLE_MAX; i++) {
8720 stable[i] = stable[i] && buf[i];
8721 }
8722 if (!table && !cflag) {
8723 *tablep = 0;
8724 }
8725}
8726
8727
8728static int
8729tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8730{
8731 if (c < TR_TABLE_MAX) {
8732 return table[c] != 0;
8733 }
8734 else {
8735 VALUE v = UINT2NUM(c);
8736
8737 if (del) {
8738 if (!NIL_P(rb_hash_lookup(del, v)) &&
8739 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8740 return TRUE;
8741 }
8742 }
8743 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8744 return FALSE;
8745 }
8746 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8747 }
8748}
8749
8750/*
8751 * call-seq:
8752 * delete!(*selectors) -> self or nil
8753 *
8754 * Like String#delete, but modifies +self+ in place.
8755 * Returns +self+ if any changes were made, +nil+ otherwise.
8756 *
8757 */
8758
8759static VALUE
8760rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8761{
8762 char squeez[TR_TABLE_SIZE];
8763 rb_encoding *enc = 0;
8764 char *s, *send, *t;
8765 VALUE del = 0, nodel = 0;
8766 int modify = 0;
8767 int i, ascompat, cr;
8768
8769 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8771 for (i=0; i<argc; i++) {
8772 VALUE s = argv[i];
8773
8774 StringValue(s);
8775 enc = rb_enc_check(str, s);
8776 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8777 }
8778
8779 str_modify_keep_cr(str);
8780 ascompat = rb_enc_asciicompat(enc);
8781 s = t = RSTRING_PTR(str);
8782 send = RSTRING_END(str);
8783 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8784 while (s < send) {
8785 unsigned int c;
8786 int clen;
8787
8788 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8789 if (squeez[c]) {
8790 modify = 1;
8791 }
8792 else {
8793 if (t != s) *t = c;
8794 t++;
8795 }
8796 s++;
8797 }
8798 else {
8799 c = rb_enc_codepoint_len(s, send, &clen, enc);
8800
8801 if (tr_find(c, squeez, del, nodel)) {
8802 modify = 1;
8803 }
8804 else {
8805 if (t != s) rb_enc_mbcput(c, t, enc);
8806 t += clen;
8808 }
8809 s += clen;
8810 }
8811 }
8812 TERM_FILL(t, TERM_LEN(str));
8813 STR_SET_LEN(str, t - RSTRING_PTR(str));
8814 ENC_CODERANGE_SET(str, cr);
8815
8816 if (modify) return str;
8817 return Qnil;
8818}
8819
8820
8821/*
8822 * call-seq:
8823 * delete(*selectors) -> new_string
8824 *
8825 * Returns a copy of +self+ with characters specified by +selectors+ removed
8826 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8827 *
8828 * "hello".delete "l","lo" #=> "heo"
8829 * "hello".delete "lo" #=> "he"
8830 * "hello".delete "aeiou", "^e" #=> "hell"
8831 * "hello".delete "ej-m" #=> "ho"
8832 *
8833 */
8834
8835static VALUE
8836rb_str_delete(int argc, VALUE *argv, VALUE str)
8837{
8838 str = str_duplicate(rb_cString, str);
8839 rb_str_delete_bang(argc, argv, str);
8840 return str;
8841}
8842
8843
8844/*
8845 * call-seq:
8846 * squeeze!(*selectors) -> self or nil
8847 *
8848 * Like String#squeeze, but modifies +self+ in place.
8849 * Returns +self+ if any changes were made, +nil+ otherwise.
8850 */
8851
8852static VALUE
8853rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8854{
8855 char squeez[TR_TABLE_SIZE];
8856 rb_encoding *enc = 0;
8857 VALUE del = 0, nodel = 0;
8858 unsigned char *s, *send, *t;
8859 int i, modify = 0;
8860 int ascompat, singlebyte = single_byte_optimizable(str);
8861 unsigned int save;
8862
8863 if (argc == 0) {
8864 enc = STR_ENC_GET(str);
8865 }
8866 else {
8867 for (i=0; i<argc; i++) {
8868 VALUE s = argv[i];
8869
8870 StringValue(s);
8871 enc = rb_enc_check(str, s);
8872 if (singlebyte && !single_byte_optimizable(s))
8873 singlebyte = 0;
8874 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8875 }
8876 }
8877
8878 str_modify_keep_cr(str);
8879 s = t = (unsigned char *)RSTRING_PTR(str);
8880 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8881 send = (unsigned char *)RSTRING_END(str);
8882 save = -1;
8883 ascompat = rb_enc_asciicompat(enc);
8884
8885 if (singlebyte) {
8886 while (s < send) {
8887 unsigned int c = *s++;
8888 if (c != save || (argc > 0 && !squeez[c])) {
8889 *t++ = save = c;
8890 }
8891 }
8892 }
8893 else {
8894 while (s < send) {
8895 unsigned int c;
8896 int clen;
8897
8898 if (ascompat && (c = *s) < 0x80) {
8899 if (c != save || (argc > 0 && !squeez[c])) {
8900 *t++ = save = c;
8901 }
8902 s++;
8903 }
8904 else {
8905 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8906
8907 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8908 if (t != s) rb_enc_mbcput(c, t, enc);
8909 save = c;
8910 t += clen;
8911 }
8912 s += clen;
8913 }
8914 }
8915 }
8916
8917 TERM_FILL((char *)t, TERM_LEN(str));
8918 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8919 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8920 modify = 1;
8921 }
8922
8923 if (modify) return str;
8924 return Qnil;
8925}
8926
8927
8928/*
8929 * call-seq:
8930 * squeeze(*selectors) -> new_string
8931 *
8932 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8933 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8934 *
8935 * "Squeezed" means that each multiple-character run of a selected character
8936 * is squeezed down to a single character;
8937 * with no arguments given, squeezes all characters:
8938 *
8939 * "yellow moon".squeeze #=> "yelow mon"
8940 *    "  now   is  the".squeeze(" ")         #=> " now is the"
8941 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8942 *
8943 */
8944
8945static VALUE
8946rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8947{
8948 str = str_duplicate(rb_cString, str);
8949 rb_str_squeeze_bang(argc, argv, str);
8950 return str;
8951}
8952
8953
8954/*
8955 * call-seq:
8956 * tr_s!(selector, replacements) -> self or nil
8957 *
8958 * Like String#tr_s, but modifies +self+ in place.
8959 * Returns +self+ if any changes were made, +nil+ otherwise.
8960 *
8961 * Related: String#squeeze!.
8962 */
8963
8964static VALUE
8965rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8966{
8967 return tr_trans(str, src, repl, 1);
8968}
8969
8970
8971/*
8972 * call-seq:
8973 * tr_s(selector, replacements) -> string
8974 *
8975 * Like String#tr, but also squeezes the modified portions of the translated string;
8976 * returns a new string (translated and squeezed).
8977 *
8978 * 'hello'.tr_s('l', 'r') #=> "hero"
8979 * 'hello'.tr_s('el', '-') #=> "h-o"
8980 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8981 *
8982 * Related: String#squeeze.
8983 *
8984 */
8985
8986static VALUE
8987rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8988{
8989 str = str_duplicate(rb_cString, str);
8990 tr_trans(str, src, repl, 1);
8991 return str;
8992}
8993
8994
8995/*
8996 * call-seq:
8997 * count(*selectors) -> integer
8998 *
8999 * Returns the total number of characters in +self+
9000 * that are specified by the given +selectors+
9001 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9002 *
9003 * a = "hello world"
9004 * a.count "lo" #=> 5
9005 * a.count "lo", "o" #=> 2
9006 * a.count "hello", "^l" #=> 4
9007 * a.count "ej-m" #=> 4
9008 *
9009 * "hello^world".count "\\^aeiou" #=> 4
9010 * "hello-world".count "a\\-eo" #=> 4
9011 *
9012 * c = "hello world\\r\\n"
9013 * c.count "\\" #=> 2
9014 * c.count "\\A" #=> 0
9015 * c.count "X-\\w" #=> 3
9016 */
9017
9018static VALUE
9019rb_str_count(int argc, VALUE *argv, VALUE str)
9020{
9021 char table[TR_TABLE_SIZE];
9022 rb_encoding *enc = 0;
9023 VALUE del = 0, nodel = 0, tstr;
9024 char *s, *send;
9025 int i;
9026 int ascompat;
9027 size_t n = 0;
9028
9030
9031 tstr = argv[0];
9032 StringValue(tstr);
9033 enc = rb_enc_check(str, tstr);
9034 if (argc == 1) {
9035 const char *ptstr;
9036 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9037 (ptstr = RSTRING_PTR(tstr),
9038 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9039 !is_broken_string(str)) {
9040 int clen;
9041 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9042
9043 s = RSTRING_PTR(str);
9044 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9045 send = RSTRING_END(str);
9046 while (s < send) {
9047 if (*(unsigned char*)s++ == c) n++;
9048 }
9049 return SIZET2NUM(n);
9050 }
9051 }
9052
9053 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9054 for (i=1; i<argc; i++) {
9055 tstr = argv[i];
9056 StringValue(tstr);
9057 enc = rb_enc_check(str, tstr);
9058 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9059 }
9060
9061 s = RSTRING_PTR(str);
9062 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9063 send = RSTRING_END(str);
9064 ascompat = rb_enc_asciicompat(enc);
9065 while (s < send) {
9066 unsigned int c;
9067
9068 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9069 if (table[c]) {
9070 n++;
9071 }
9072 s++;
9073 }
9074 else {
9075 int clen;
9076 c = rb_enc_codepoint_len(s, send, &clen, enc);
9077 if (tr_find(c, table, del, nodel)) {
9078 n++;
9079 }
9080 s += clen;
9081 }
9082 }
9083
9084 return SIZET2NUM(n);
9085}
9086
9087static VALUE
9088rb_fs_check(VALUE val)
9089{
9090 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9091 val = rb_check_string_type(val);
9092 if (NIL_P(val)) return 0;
9093 }
9094 return val;
9095}
9096
/*
 * Byte-indexed lookup table: 1 for bytes 0x09-0x0d (\t \n \v \f \r) and
 * 0x20 (space), 0 for every other byte value.
 */
static const char isspacetable[256] = {
    ['\t'] = 1,
    ['\n'] = 1,
    ['\v'] = 1,
    ['\f'] = 1,
    ['\r'] = 1,
    [' ']  = 1,
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9117
9118static long
9119split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9120{
9121 if (empty_count >= 0 && len == 0) {
9122 return empty_count + 1;
9123 }
9124 if (empty_count > 0) {
9125 /* make different substrings */
9126 if (result) {
9127 do {
9128 rb_ary_push(result, str_new_empty_String(str));
9129 } while (--empty_count > 0);
9130 }
9131 else {
9132 do {
9133 rb_yield(str_new_empty_String(str));
9134 } while (--empty_count > 0);
9135 }
9136 }
9137 str = rb_str_subseq(str, beg, len);
9138 if (result) {
9139 rb_ary_push(result, str);
9140 }
9141 else {
9142 rb_yield(str);
9143 }
9144 return empty_count;
9145}
9146
9147typedef enum {
9148 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9149} split_type_t;
9150
9151static split_type_t
9152literal_split_pattern(VALUE spat, split_type_t default_type)
9153{
9154 rb_encoding *enc = STR_ENC_GET(spat);
9155 const char *ptr;
9156 long len;
9157 RSTRING_GETMEM(spat, ptr, len);
9158 if (len == 0) {
9159 /* Special case - split into chars */
9160 return SPLIT_TYPE_CHARS;
9161 }
9162 else if (rb_enc_asciicompat(enc)) {
9163 if (len == 1 && ptr[0] == ' ') {
9164 return SPLIT_TYPE_AWK;
9165 }
9166 }
9167 else {
9168 int l;
9169 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9170 return SPLIT_TYPE_AWK;
9171 }
9172 }
9173 return default_type;
9174}
9175
9176/*
9177 * call-seq:
9178 * split(field_sep = $;, limit = 0) -> array
9179 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9180 *
9181 * :include: doc/string/split.rdoc
9182 *
9183 */
9184
9185static VALUE
9186rb_str_split_m(int argc, VALUE *argv, VALUE str)
9187{
9188 rb_encoding *enc;
9189 VALUE spat;
9190 VALUE limit;
9191 split_type_t split_type;
9192 long beg, end, i = 0, empty_count = -1;
9193 int lim = 0;
9194 VALUE result, tmp;
9195
9196 result = rb_block_given_p() ? Qfalse : Qnil;
9197 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9198 lim = NUM2INT(limit);
9199 if (lim <= 0) limit = Qnil;
9200 else if (lim == 1) {
9201 if (RSTRING_LEN(str) == 0)
9202 return result ? rb_ary_new2(0) : str;
9203 tmp = str_duplicate(rb_cString, str);
9204 if (!result) {
9205 rb_yield(tmp);
9206 return str;
9207 }
9208 return rb_ary_new3(1, tmp);
9209 }
9210 i = 1;
9211 }
9212 if (NIL_P(limit) && !lim) empty_count = 0;
9213
9214 enc = STR_ENC_GET(str);
9215 split_type = SPLIT_TYPE_REGEXP;
9216 if (!NIL_P(spat)) {
9217 spat = get_pat_quoted(spat, 0);
9218 }
9219 else if (NIL_P(spat = rb_fs)) {
9220 split_type = SPLIT_TYPE_AWK;
9221 }
9222 else if (!(spat = rb_fs_check(spat))) {
9223 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9224 }
9225 else {
9226 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9227 }
9228 if (split_type != SPLIT_TYPE_AWK) {
9229 switch (BUILTIN_TYPE(spat)) {
9230 case T_REGEXP:
9231 rb_reg_options(spat); /* check if uninitialized */
9232 tmp = RREGEXP_SRC(spat);
9233 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9234 if (split_type == SPLIT_TYPE_AWK) {
9235 spat = tmp;
9236 split_type = SPLIT_TYPE_STRING;
9237 }
9238 break;
9239
9240 case T_STRING:
9241 mustnot_broken(spat);
9242 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9243 break;
9244
9245 default:
9247 }
9248 }
9249
9250#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9251
9252 beg = 0;
9253 char *ptr = RSTRING_PTR(str);
9254 char *eptr = RSTRING_END(str);
9255 if (split_type == SPLIT_TYPE_AWK) {
9256 char *bptr = ptr;
9257 int skip = 1;
9258 unsigned int c;
9259
9260 if (result) result = rb_ary_new();
9261 end = beg;
9262 if (is_ascii_string(str)) {
9263 while (ptr < eptr) {
9264 c = (unsigned char)*ptr++;
9265 if (skip) {
9266 if (ascii_isspace(c)) {
9267 beg = ptr - bptr;
9268 }
9269 else {
9270 end = ptr - bptr;
9271 skip = 0;
9272 if (!NIL_P(limit) && lim <= i) break;
9273 }
9274 }
9275 else if (ascii_isspace(c)) {
9276 SPLIT_STR(beg, end-beg);
9277 skip = 1;
9278 beg = ptr - bptr;
9279 if (!NIL_P(limit)) ++i;
9280 }
9281 else {
9282 end = ptr - bptr;
9283 }
9284 }
9285 }
9286 else {
9287 while (ptr < eptr) {
9288 int n;
9289
9290 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9291 ptr += n;
9292 if (skip) {
9293 if (rb_isspace(c)) {
9294 beg = ptr - bptr;
9295 }
9296 else {
9297 end = ptr - bptr;
9298 skip = 0;
9299 if (!NIL_P(limit) && lim <= i) break;
9300 }
9301 }
9302 else if (rb_isspace(c)) {
9303 SPLIT_STR(beg, end-beg);
9304 skip = 1;
9305 beg = ptr - bptr;
9306 if (!NIL_P(limit)) ++i;
9307 }
9308 else {
9309 end = ptr - bptr;
9310 }
9311 }
9312 }
9313 }
9314 else if (split_type == SPLIT_TYPE_STRING) {
9315 char *str_start = ptr;
9316 char *substr_start = ptr;
9317 char *sptr = RSTRING_PTR(spat);
9318 long slen = RSTRING_LEN(spat);
9319
9320 if (result) result = rb_ary_new();
9321 mustnot_broken(str);
9322 enc = rb_enc_check(str, spat);
9323 while (ptr < eptr &&
9324 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9325 /* Check we are at the start of a char */
9326 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9327 if (t != ptr + end) {
9328 ptr = t;
9329 continue;
9330 }
9331 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9332 ptr += end + slen;
9333 substr_start = ptr;
9334 if (!NIL_P(limit) && lim <= ++i) break;
9335 }
9336 beg = ptr - str_start;
9337 }
9338 else if (split_type == SPLIT_TYPE_CHARS) {
9339 char *str_start = ptr;
9340 int n;
9341
9342 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9343 mustnot_broken(str);
9344 enc = rb_enc_get(str);
9345 while (ptr < eptr &&
9346 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9347 SPLIT_STR(ptr - str_start, n);
9348 ptr += n;
9349 if (!NIL_P(limit) && lim <= ++i) break;
9350 }
9351 beg = ptr - str_start;
9352 }
9353 else {
9354 if (result) result = rb_ary_new();
9355 long len = RSTRING_LEN(str);
9356 long start = beg;
9357 long idx;
9358 int last_null = 0;
9359 struct re_registers *regs;
9360 VALUE match = 0;
9361
9362 for (; rb_reg_search(spat, str, start, 0) >= 0;
9363 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9364 match = rb_backref_get();
9365 if (!result) rb_match_busy(match);
9366 regs = RMATCH_REGS(match);
9367 end = BEG(0);
9368 if (start == end && BEG(0) == END(0)) {
9369 if (!ptr) {
9370 SPLIT_STR(0, 0);
9371 break;
9372 }
9373 else if (last_null == 1) {
9374 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9375 beg = start;
9376 }
9377 else {
9378 if (start == len)
9379 start++;
9380 else
9381 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9382 last_null = 1;
9383 continue;
9384 }
9385 }
9386 else {
9387 SPLIT_STR(beg, end-beg);
9388 beg = start = END(0);
9389 }
9390 last_null = 0;
9391
9392 for (idx=1; idx < regs->num_regs; idx++) {
9393 if (BEG(idx) == -1) continue;
9394 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9395 }
9396 if (!NIL_P(limit) && lim <= ++i) break;
9397 }
9398 if (match) rb_match_unbusy(match);
9399 }
9400 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9401 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9402 }
9403
9404 return result ? result : str;
9405}
9406
9407VALUE
9408rb_str_split(VALUE str, const char *sep0)
9409{
9410 VALUE sep;
9411
9412 StringValue(str);
9413 sep = rb_str_new_cstr(sep0);
9414 return rb_str_split_m(1, &sep, str);
9415}
9416
9417#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9418
9419static inline int
9420enumerator_element(VALUE ary, VALUE e)
9421{
9422 if (ary) {
9423 rb_ary_push(ary, e);
9424 return 0;
9425 }
9426 else {
9427 rb_yield(e);
9428 return 1;
9429 }
9430}
9431
9432#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9433
9434static const char *
9435chomp_newline(const char *p, const char *e, rb_encoding *enc)
9436{
9437 const char *prev = rb_enc_prev_char(p, e, e, enc);
9438 if (rb_enc_is_newline(prev, e, enc)) {
9439 e = prev;
9440 prev = rb_enc_prev_char(p, e, e, enc);
9441 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9442 e = prev;
9443 }
9444 return e;
9445}
9446
9447static VALUE
9448get_rs(void)
9449{
9450 VALUE rs = rb_rs;
9451 if (!NIL_P(rs) &&
9452 (!RB_TYPE_P(rs, T_STRING) ||
9453 RSTRING_LEN(rs) != 1 ||
9454 RSTRING_PTR(rs)[0] != '\n')) {
9455 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9456 }
9457 return rs;
9458}
9459
9460#define rb_rs get_rs()
9461
9462static VALUE
9463rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9464{
9465 rb_encoding *enc;
9466 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9467 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9468 long pos, len, rslen;
9469 int rsnewline = 0;
9470
9471 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9472 rs = rb_rs;
9473 if (!NIL_P(opts)) {
9474 static ID keywords[1];
9475 if (!keywords[0]) {
9476 keywords[0] = rb_intern_const("chomp");
9477 }
9478 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9479 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9480 }
9481
9482 if (NIL_P(rs)) {
9483 if (!ENUM_ELEM(ary, str)) {
9484 return ary;
9485 }
9486 else {
9487 return orig;
9488 }
9489 }
9490
9491 if (!RSTRING_LEN(str)) goto end;
9492 str = rb_str_new_frozen(str);
9493 ptr = subptr = RSTRING_PTR(str);
9494 pend = RSTRING_END(str);
9495 len = RSTRING_LEN(str);
9496 StringValue(rs);
9497 rslen = RSTRING_LEN(rs);
9498
9499 if (rs == rb_default_rs)
9500 enc = rb_enc_get(str);
9501 else
9502 enc = rb_enc_check(str, rs);
9503
9504 if (rslen == 0) {
9505 /* paragraph mode */
9506 int n;
9507 const char *eol = NULL;
9508 subend = subptr;
9509 while (subend < pend) {
9510 long chomp_rslen = 0;
9511 do {
9512 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9513 n = 0;
9514 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9515 if (rb_enc_is_newline(subend + n, pend, enc)) {
9516 if (eol == subend) break;
9517 subend += rslen;
9518 if (subptr) {
9519 eol = subend;
9520 chomp_rslen = -rslen;
9521 }
9522 }
9523 else {
9524 if (!subptr) subptr = subend;
9525 subend += rslen;
9526 }
9527 rslen = 0;
9528 } while (subend < pend);
9529 if (!subptr) break;
9530 if (rslen == 0) chomp_rslen = 0;
9531 line = rb_str_subseq(str, subptr - ptr,
9532 subend - subptr + (chomp ? chomp_rslen : rslen));
9533 if (ENUM_ELEM(ary, line)) {
9534 str_mod_check(str, ptr, len);
9535 }
9536 subptr = eol = NULL;
9537 }
9538 goto end;
9539 }
9540 else {
9541 rsptr = RSTRING_PTR(rs);
9542 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9543 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9544 rsnewline = 1;
9545 }
9546 }
9547
9548 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9549 rs = rb_str_new(rsptr, rslen);
9550 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9551 rsptr = RSTRING_PTR(rs);
9552 rslen = RSTRING_LEN(rs);
9553 }
9554
9555 while (subptr < pend) {
9556 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9557 if (pos < 0) break;
9558 hit = subptr + pos;
9559 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9560 if (hit != adjusted) {
9561 subptr = adjusted;
9562 continue;
9563 }
9564 subend = hit += rslen;
9565 if (chomp) {
9566 if (rsnewline) {
9567 subend = chomp_newline(subptr, subend, enc);
9568 }
9569 else {
9570 subend -= rslen;
9571 }
9572 }
9573 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9574 if (ENUM_ELEM(ary, line)) {
9575 str_mod_check(str, ptr, len);
9576 }
9577 subptr = hit;
9578 }
9579
9580 if (subptr != pend) {
9581 if (chomp) {
9582 if (rsnewline) {
9583 pend = chomp_newline(subptr, pend, enc);
9584 }
9585 else if (pend - subptr >= rslen &&
9586 memcmp(pend - rslen, rsptr, rslen) == 0) {
9587 pend -= rslen;
9588 }
9589 }
9590 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9591 ENUM_ELEM(ary, line);
9592 RB_GC_GUARD(str);
9593 }
9594
9595 end:
9596 if (ary)
9597 return ary;
9598 else
9599 return orig;
9600}
9601
9602/*
9603 * call-seq:
9604 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9605 * each_line(line_sep = $/, chomp: false) -> enumerator
9606 *
9607 * :include: doc/string/each_line.rdoc
9608 *
9609 */
9610
9611static VALUE
9612rb_str_each_line(int argc, VALUE *argv, VALUE str)
9613{
9614 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9615 return rb_str_enumerate_lines(argc, argv, str, 0);
9616}
9617
9618/*
9619 * call-seq:
9620 * lines(line_sep = $/, chomp: false) -> array_of_strings
9621 *
9622 * Forms substrings ("lines") of +self+ according to the given arguments
9623 * (see String#each_line for details); returns the lines in an array.
9624 *
9625 */
9626
9627static VALUE
9628rb_str_lines(int argc, VALUE *argv, VALUE str)
9629{
9630 VALUE ary = WANTARRAY("lines", 0);
9631 return rb_str_enumerate_lines(argc, argv, str, ary);
9632}
9633
9634static VALUE
9635rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9636{
9637 return LONG2FIX(RSTRING_LEN(str));
9638}
9639
9640static VALUE
9641rb_str_enumerate_bytes(VALUE str, VALUE ary)
9642{
9643 long i;
9644
9645 for (i=0; i<RSTRING_LEN(str); i++) {
9646 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9647 }
9648 if (ary)
9649 return ary;
9650 else
9651 return str;
9652}
9653
9654/*
9655 * call-seq:
9656 * each_byte {|byte| ... } -> self
9657 * each_byte -> enumerator
9658 *
9659 * :include: doc/string/each_byte.rdoc
9660 *
9661 */
9662
9663static VALUE
9664rb_str_each_byte(VALUE str)
9665{
9666 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9667 return rb_str_enumerate_bytes(str, 0);
9668}
9669
9670/*
9671 * call-seq:
9672 * bytes -> array_of_bytes
9673 *
9674 * :include: doc/string/bytes.rdoc
9675 *
9676 */
9677
9678static VALUE
9679rb_str_bytes(VALUE str)
9680{
9681 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9682 return rb_str_enumerate_bytes(str, ary);
9683}
9684
9685static VALUE
9686rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9687{
9688 return rb_str_length(str);
9689}
9690
9691static VALUE
9692rb_str_enumerate_chars(VALUE str, VALUE ary)
9693{
9694 VALUE orig = str;
9695 long i, len, n;
9696 const char *ptr;
9697 rb_encoding *enc;
9698
9699 str = rb_str_new_frozen(str);
9700 ptr = RSTRING_PTR(str);
9701 len = RSTRING_LEN(str);
9702 enc = rb_enc_get(str);
9703
9705 for (i = 0; i < len; i += n) {
9706 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9707 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9708 }
9709 }
9710 else {
9711 for (i = 0; i < len; i += n) {
9712 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9713 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9714 }
9715 }
9716 RB_GC_GUARD(str);
9717 if (ary)
9718 return ary;
9719 else
9720 return orig;
9721}
9722
9723/*
9724 * call-seq:
9725 * each_char {|c| ... } -> self
9726 * each_char -> enumerator
9727 *
9728 * :include: doc/string/each_char.rdoc
9729 *
9730 */
9731
9732static VALUE
9733rb_str_each_char(VALUE str)
9734{
9735 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9736 return rb_str_enumerate_chars(str, 0);
9737}
9738
9739/*
9740 * call-seq:
9741 * chars -> array_of_characters
9742 *
9743 * :include: doc/string/chars.rdoc
9744 *
9745 */
9746
9747static VALUE
9748rb_str_chars(VALUE str)
9749{
9750 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9751 return rb_str_enumerate_chars(str, ary);
9752}
9753
9754static VALUE
9755rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9756{
9757 VALUE orig = str;
9758 int n;
9759 unsigned int c;
9760 const char *ptr, *end;
9761 rb_encoding *enc;
9762
9763 if (single_byte_optimizable(str))
9764 return rb_str_enumerate_bytes(str, ary);
9765
9766 str = rb_str_new_frozen(str);
9767 ptr = RSTRING_PTR(str);
9768 end = RSTRING_END(str);
9769 enc = STR_ENC_GET(str);
9770
9771 while (ptr < end) {
9772 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9773 ENUM_ELEM(ary, UINT2NUM(c));
9774 ptr += n;
9775 }
9776 RB_GC_GUARD(str);
9777 if (ary)
9778 return ary;
9779 else
9780 return orig;
9781}
9782
9783/*
9784 * call-seq:
9785 * each_codepoint {|integer| ... } -> self
9786 * each_codepoint -> enumerator
9787 *
9788 * :include: doc/string/each_codepoint.rdoc
9789 *
9790 */
9791
9792static VALUE
9793rb_str_each_codepoint(VALUE str)
9794{
9795 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9796 return rb_str_enumerate_codepoints(str, 0);
9797}
9798
9799/*
9800 * call-seq:
9801 * codepoints -> array_of_integers
9802 *
9803 * :include: doc/string/codepoints.rdoc
9804 *
9805 */
9806
9807static VALUE
9808rb_str_codepoints(VALUE str)
9809{
9810 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9811 return rb_str_enumerate_codepoints(str, ary);
9812}
9813
9814static regex_t *
9815get_reg_grapheme_cluster(rb_encoding *enc)
9816{
9817 int encidx = rb_enc_to_index(enc);
9818
9819 const OnigUChar source_ascii[] = "\\X";
9820 const OnigUChar *source = source_ascii;
9821 size_t source_len = sizeof(source_ascii) - 1;
9822
9823 switch (encidx) {
9824#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9825#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9826#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9827#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9828#define CASE_UTF(e) \
9829 case ENCINDEX_UTF_##e: { \
9830 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9831 source = source_UTF_##e; \
9832 source_len = sizeof(source_UTF_##e); \
9833 break; \
9834 }
9835 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9836#undef CASE_UTF
9837#undef CHARS_16BE
9838#undef CHARS_16LE
9839#undef CHARS_32BE
9840#undef CHARS_32LE
9841 }
9842
9843 regex_t *reg_grapheme_cluster;
9844 OnigErrorInfo einfo;
9845 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9846 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9847 if (r) {
9848 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9849 onig_error_code_to_str(message, r, &einfo);
9850 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9851 }
9852
9853 return reg_grapheme_cluster;
9854}
9855
9856static regex_t *
9857get_cached_reg_grapheme_cluster(rb_encoding *enc)
9858{
9859 int encidx = rb_enc_to_index(enc);
9860 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9861
9862 if (encidx == rb_utf8_encindex()) {
9863 if (!reg_grapheme_cluster_utf8) {
9864 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9865 }
9866
9867 return reg_grapheme_cluster_utf8;
9868 }
9869
9870 return NULL;
9871}
9872
9873static VALUE
9874rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9875{
9876 size_t grapheme_cluster_count = 0;
9877 rb_encoding *enc = get_encoding(str);
9878 const char *ptr, *end;
9879
9880 if (!rb_enc_unicode_p(enc)) {
9881 return rb_str_length(str);
9882 }
9883
9884 bool cached_reg_grapheme_cluster = true;
9885 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9886 if (!reg_grapheme_cluster) {
9887 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9888 cached_reg_grapheme_cluster = false;
9889 }
9890
9891 ptr = RSTRING_PTR(str);
9892 end = RSTRING_END(str);
9893
9894 while (ptr < end) {
9895 OnigPosition len = onig_match(reg_grapheme_cluster,
9896 (const OnigUChar *)ptr, (const OnigUChar *)end,
9897 (const OnigUChar *)ptr, NULL, 0);
9898 if (len <= 0) break;
9899 grapheme_cluster_count++;
9900 ptr += len;
9901 }
9902
9903 if (!cached_reg_grapheme_cluster) {
9904 onig_free(reg_grapheme_cluster);
9905 }
9906
9907 return SIZET2NUM(grapheme_cluster_count);
9908}
9909
9910static VALUE
9911rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9912{
9913 VALUE orig = str;
9914 rb_encoding *enc = get_encoding(str);
9915 const char *ptr0, *ptr, *end;
9916
9917 if (!rb_enc_unicode_p(enc)) {
9918 return rb_str_enumerate_chars(str, ary);
9919 }
9920
9921 if (!ary) str = rb_str_new_frozen(str);
9922
9923 bool cached_reg_grapheme_cluster = true;
9924 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9925 if (!reg_grapheme_cluster) {
9926 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9927 cached_reg_grapheme_cluster = false;
9928 }
9929
9930 ptr0 = ptr = RSTRING_PTR(str);
9931 end = RSTRING_END(str);
9932
9933 while (ptr < end) {
9934 OnigPosition len = onig_match(reg_grapheme_cluster,
9935 (const OnigUChar *)ptr, (const OnigUChar *)end,
9936 (const OnigUChar *)ptr, NULL, 0);
9937 if (len <= 0) break;
9938 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9939 ptr += len;
9940 }
9941
9942 if (!cached_reg_grapheme_cluster) {
9943 onig_free(reg_grapheme_cluster);
9944 }
9945
9946 RB_GC_GUARD(str);
9947 if (ary)
9948 return ary;
9949 else
9950 return orig;
9951}
9952
9953/*
9954 * call-seq:
9955 * each_grapheme_cluster {|gc| ... } -> self
9956 * each_grapheme_cluster -> enumerator
9957 *
9958 * :include: doc/string/each_grapheme_cluster.rdoc
9959 *
9960 */
9961
9962static VALUE
9963rb_str_each_grapheme_cluster(VALUE str)
9964{
9965 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9966 return rb_str_enumerate_grapheme_clusters(str, 0);
9967}
9968
9969/*
9970 * call-seq:
9971 * grapheme_clusters -> array_of_grapheme_clusters
9972 *
9973 * :include: doc/string/grapheme_clusters.rdoc
9974 *
9975 */
9976
9977static VALUE
9978rb_str_grapheme_clusters(VALUE str)
9979{
9980 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9981 return rb_str_enumerate_grapheme_clusters(str, ary);
9982}
9983
9984static long
9985chopped_length(VALUE str)
9986{
9987 rb_encoding *enc = STR_ENC_GET(str);
9988 const char *p, *p2, *beg, *end;
9989
9990 beg = RSTRING_PTR(str);
9991 end = beg + RSTRING_LEN(str);
9992 if (beg >= end) return 0;
9993 p = rb_enc_prev_char(beg, end, end, enc);
9994 if (!p) return 0;
9995 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9996 p2 = rb_enc_prev_char(beg, p, end, enc);
9997 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9998 }
9999 return p - beg;
10000}
10001
10002/*
10003 * call-seq:
10004 * chop! -> self or nil
10005 *
10006 * Like String#chop, but modifies +self+ in place;
10007 * returns +nil+ if +self+ is empty, +self+ otherwise.
10008 *
10009 * Related: String#chomp!.
10010 */
10011
10012static VALUE
10013rb_str_chop_bang(VALUE str)
10014{
10015 str_modify_keep_cr(str);
10016 if (RSTRING_LEN(str) > 0) {
10017 long len;
10018 len = chopped_length(str);
10019 STR_SET_LEN(str, len);
10020 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10021 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10023 }
10024 return str;
10025 }
10026 return Qnil;
10027}
10028
10029
10030/*
10031 * call-seq:
10032 * chop -> new_string
10033 *
10034 * :include: doc/string/chop.rdoc
10035 *
10036 */
10037
10038static VALUE
10039rb_str_chop(VALUE str)
10040{
10041 return rb_str_subseq(str, 0, chopped_length(str));
10042}
10043
10044static long
10045smart_chomp(VALUE str, const char *e, const char *p)
10046{
10047 rb_encoding *enc = rb_enc_get(str);
10048 if (rb_enc_mbminlen(enc) > 1) {
10049 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10050 if (rb_enc_is_newline(pp, e, enc)) {
10051 e = pp;
10052 }
10053 pp = e - rb_enc_mbminlen(enc);
10054 if (pp >= p) {
10055 pp = rb_enc_left_char_head(p, pp, e, enc);
10056 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10057 e = pp;
10058 }
10059 }
10060 }
10061 else {
10062 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10063 case '\n':
10064 if (--e > p && *(e-1) == '\r') {
10065 --e;
10066 }
10067 break;
10068 case '\r':
10069 --e;
10070 break;
10071 }
10072 }
10073 return e - p;
10074}
10075
/*
 * Byte length +str+ would have after removing a trailing record separator
 * +rs+.  Does not modify +str+.
 *
 *  - empty +str+: 0
 *  - +rs+ is rb_default_rs: one line ending is removed (see smart_chomp)
 *  - +rs+ is empty: every trailing "\n", each optionally preceded by "\r",
 *    is removed
 *  - +rs+ is longer than +str+: nothing is removed
 *  - +rs+ is one minimum-width unit that is a newline: as for rb_default_rs
 *  - otherwise: +rs+ is removed only when +str+ ends with exactly those
 *    bytes and the match starts on a character boundary
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
        return smart_chomp(str, e, p);
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
        /* empty separator: strip all trailing newlines */
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                /* also drop a "\r" directly before the newline just removed */
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    /* a separator longer than the string cannot be its suffix */
    if (rslen > len) return len;

    /* from here on +enc+ is the separator's encoding */
    enc = rb_enc_get(rs);
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
        /* separator is a single minimum-width unit: treat a newline
         * separator like the default one */
        if (rslen == 1) {
            if (newline == '\n')
                return smart_chomp(str, e, p);
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                return smart_chomp(str, e, p);
        }
    }

    /* NOTE(review): presumably raises when the two encodings are
     * incompatible -- confirm against rb_enc_check */
    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        /* a broken separator never matches */
        return len;
    }
    pp = e - rslen;
    /* cheap last-byte comparison first, full memcmp only for multi-byte rs */
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        if (at_char_boundary(p, pp, e, enc))
            return len - rslen;
        RB_GC_GUARD(rs);
    }
    return len;
}
10147
10153static VALUE
10154chomp_rs(int argc, const VALUE *argv)
10155{
10156 rb_check_arity(argc, 0, 1);
10157 if (argc > 0) {
10158 VALUE rs = argv[0];
10159 if (!NIL_P(rs)) StringValue(rs);
10160 return rs;
10161 }
10162 else {
10163 return rb_rs;
10164 }
10165}
10166
10167VALUE
10168rb_str_chomp_string(VALUE str, VALUE rs)
10169{
10170 long olen = RSTRING_LEN(str);
10171 long len = chompped_length(str, rs);
10172 if (len >= olen) return Qnil;
10173 str_modify_keep_cr(str);
10174 STR_SET_LEN(str, len);
10175 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10176 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10178 }
10179 return str;
10180}
10181
10182/*
10183 * call-seq:
10184 * chomp!(line_sep = $/) -> self or nil
10185 *
10186 * Like String#chomp, but modifies +self+ in place;
10187 * returns +nil+ if no modification made, +self+ otherwise.
10188 *
10189 */
10190
10191static VALUE
10192rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10193{
10194 VALUE rs;
10195 str_modifiable(str);
10196 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10197 rs = chomp_rs(argc, argv);
10198 if (NIL_P(rs)) return Qnil;
10199 return rb_str_chomp_string(str, rs);
10200}
10201
10202
10203/*
10204 * call-seq:
10205 * chomp(line_sep = $/) -> new_string
10206 *
10207 * :include: doc/string/chomp.rdoc
10208 *
10209 */
10210
10211static VALUE
10212rb_str_chomp(int argc, VALUE *argv, VALUE str)
10213{
10214 VALUE rs = chomp_rs(argc, argv);
10215 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10216 return rb_str_subseq(str, 0, chompped_length(str, rs));
10217}
10218
10219static long
10220lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10221{
10222 const char *const start = s;
10223
10224 if (!s || s >= e) return 0;
10225
10226 /* remove spaces at head */
10227 if (single_byte_optimizable(str)) {
10228 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10229 }
10230 else {
10231 while (s < e) {
10232 int n;
10233 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10234
10235 if (cc && !rb_isspace(cc)) break;
10236 s += n;
10237 }
10238 }
10239 return s - start;
10240}
10241
10242/*
10243 * call-seq:
10244 * lstrip! -> self or nil
10245 *
10246 * Like String#lstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
10248 *
10249 * Related: String#rstrip!, String#strip!.
10250 */
10251
10252static VALUE
10253rb_str_lstrip_bang(VALUE str)
10254{
10255 rb_encoding *enc;
10256 char *start, *s;
10257 long olen, loffset;
10258
10259 str_modify_keep_cr(str);
10260 enc = STR_ENC_GET(str);
10261 RSTRING_GETMEM(str, start, olen);
10262 loffset = lstrip_offset(str, start, start+olen, enc);
10263 if (loffset > 0) {
10264 long len = olen-loffset;
10265 s = start + loffset;
10266 memmove(start, s, len);
10267 STR_SET_LEN(str, len);
10268 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10269 return str;
10270 }
10271 return Qnil;
10272}
10273
10274
10275/*
10276 * call-seq:
10277 * lstrip -> new_string
10278 *
10279 * Returns a copy of +self+ with leading whitespace removed;
10280 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10281 *
10282 * whitespace = "\x00\t\n\v\f\r "
10283 * s = whitespace + 'abc' + whitespace
10284 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10285 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10286 *
10287 * Related: String#rstrip, String#strip.
10288 */
10289
10290static VALUE
10291rb_str_lstrip(VALUE str)
10292{
10293 char *start;
10294 long len, loffset;
10295 RSTRING_GETMEM(str, start, len);
10296 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10297 if (loffset <= 0) return str_duplicate(rb_cString, str);
10298 return rb_str_subseq(str, loffset, len - loffset);
10299}
10300
10301static long
10302rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10303{
10304 const char *t;
10305
10306 rb_str_check_dummy_enc(enc);
10308 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10309 }
10310 if (!s || s >= e) return 0;
10311 t = e;
10312
10313 /* remove trailing spaces or '\0's */
10314 if (single_byte_optimizable(str)) {
10315 unsigned char c;
10316 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10317 }
10318 else {
10319 char *tp;
10320
10321 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10322 unsigned int c = rb_enc_codepoint(tp, e, enc);
10323 if (c && !rb_isspace(c)) break;
10324 t = tp;
10325 }
10326 }
10327 return e - t;
10328}
10329
10330/*
10331 * call-seq:
10332 * rstrip! -> self or nil
10333 *
10334 * Like String#rstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
10336 *
10337 * Related: String#lstrip!, String#strip!.
10338 */
10339
10340static VALUE
10341rb_str_rstrip_bang(VALUE str)
10342{
10343 rb_encoding *enc;
10344 char *start;
10345 long olen, roffset;
10346
10347 str_modify_keep_cr(str);
10348 enc = STR_ENC_GET(str);
10349 RSTRING_GETMEM(str, start, olen);
10350 roffset = rstrip_offset(str, start, start+olen, enc);
10351 if (roffset > 0) {
10352 long len = olen - roffset;
10353
10354 STR_SET_LEN(str, len);
10355 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10356 return str;
10357 }
10358 return Qnil;
10359}
10360
10361
10362/*
10363 * call-seq:
10364 * rstrip -> new_string
10365 *
10366 * Returns a copy of the receiver with trailing whitespace removed;
10367 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10368 *
10369 * whitespace = "\x00\t\n\v\f\r "
10370 * s = whitespace + 'abc' + whitespace
10371 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10372 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10373 *
10374 * Related: String#lstrip, String#strip.
10375 */
10376
10377static VALUE
10378rb_str_rstrip(VALUE str)
10379{
10380 rb_encoding *enc;
10381 char *start;
10382 long olen, roffset;
10383
10384 enc = STR_ENC_GET(str);
10385 RSTRING_GETMEM(str, start, olen);
10386 roffset = rstrip_offset(str, start, start+olen, enc);
10387
10388 if (roffset <= 0) return str_duplicate(rb_cString, str);
10389 return rb_str_subseq(str, 0, olen-roffset);
10390}
10391
10392
10393/*
10394 * call-seq:
10395 * strip! -> self or nil
10396 *
10397 * Like String#strip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#rstrip!.
10401 */
10402
10403static VALUE
10404rb_str_strip_bang(VALUE str)
10405{
10406 char *start;
10407 long olen, loffset, roffset;
10408 rb_encoding *enc;
10409
10410 str_modify_keep_cr(str);
10411 enc = STR_ENC_GET(str);
10412 RSTRING_GETMEM(str, start, olen);
10413 loffset = lstrip_offset(str, start, start+olen, enc);
10414 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10415
10416 if (loffset > 0 || roffset > 0) {
10417 long len = olen-roffset;
10418 if (loffset > 0) {
10419 len -= loffset;
10420 memmove(start, start + loffset, len);
10421 }
10422 STR_SET_LEN(str, len);
10423 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10424 return str;
10425 }
10426 return Qnil;
10427}
10428
10429
10430/*
10431 * call-seq:
10432 * strip -> new_string
10433 *
10434 * Returns a copy of the receiver with leading and trailing whitespace removed;
10435 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10436 *
10437 * whitespace = "\x00\t\n\v\f\r "
10438 * s = whitespace + 'abc' + whitespace
10439 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10440 * s.strip # => "abc"
10441 *
10442 * Related: String#lstrip, String#rstrip.
10443 */
10444
10445static VALUE
10446rb_str_strip(VALUE str)
10447{
10448 char *start;
10449 long olen, loffset, roffset;
10450 rb_encoding *enc = STR_ENC_GET(str);
10451
10452 RSTRING_GETMEM(str, start, olen);
10453 loffset = lstrip_offset(str, start, start+olen, enc);
10454 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10455
10456 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10457 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10458}
10459
10460static VALUE
10461scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10462{
10463 VALUE result = Qnil;
10464 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10465 if (pos >= 0) {
10466 VALUE match;
10467 struct re_registers *regs;
10468 if (BUILTIN_TYPE(pat) == T_STRING) {
10469 regs = NULL;
10470 end = pos + RSTRING_LEN(pat);
10471 }
10472 else {
10473 match = rb_backref_get();
10474 regs = RMATCH_REGS(match);
10475 pos = BEG(0);
10476 end = END(0);
10477 }
10478
10479 if (pos == end) {
10480 rb_encoding *enc = STR_ENC_GET(str);
10481 /*
10482 * Always consume at least one character of the input string
10483 */
10484 if (RSTRING_LEN(str) > end)
10485 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10486 RSTRING_END(str), enc);
10487 else
10488 *start = end + 1;
10489 }
10490 else {
10491 *start = end;
10492 }
10493
10494 if (!regs || regs->num_regs == 1) {
10495 result = rb_str_subseq(str, pos, end - pos);
10496 return result;
10497 }
10498 else {
10499 result = rb_ary_new2(regs->num_regs);
10500 for (int i = 1; i < regs->num_regs; i++) {
10501 VALUE s = Qnil;
10502 if (BEG(i) >= 0) {
10503 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10504 }
10505
10506 rb_ary_push(result, s);
10507 }
10508 }
10509
10510 RB_GC_GUARD(match);
10511 }
10512
10513 return result;
10514}
10515
10516
10517/*
10518 * call-seq:
10519 * scan(string_or_regexp) -> array
10520 * scan(string_or_regexp) {|matches| ... } -> self
10521 *
10522 * Matches a pattern against +self+; the pattern is:
10523 *
10524 * - +string_or_regexp+ itself, if it is a Regexp.
10525 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10526 *
10527 * Iterates through +self+, generating a collection of matching results:
10528 *
10529 * - If the pattern contains no groups, each result is the
10530 * matched string, <code>$&</code>.
10531 * - If the pattern contains groups, each result is an array
10532 * containing one entry per group.
10533 *
10534 * With no block given, returns an array of the results:
10535 *
10536 * s = 'cruel world'
10537 * s.scan(/\w+/) # => ["cruel", "world"]
10538 * s.scan(/.../) # => ["cru", "el ", "wor"]
10539 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10540 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10541 *
10542 * With a block given, calls the block with each result; returns +self+:
10543 *
10544 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10545 * print "\n"
10546 * s.scan(/(.)(.)/) {|x,y| print y, x }
10547 * print "\n"
10548 *
10549 * Output:
10550 *
10551 * <<cruel>> <<world>>
10552 * rceu lowlr
10553 *
10554 */
10555
10556static VALUE
10557rb_str_scan(VALUE str, VALUE pat)
10558{
10559 VALUE result;
10560 long start = 0;
10561 long last = -1, prev = 0;
10562 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10563
10564 pat = get_pat_quoted(pat, 1);
10565 mustnot_broken(str);
10566 if (!rb_block_given_p()) {
10567 VALUE ary = rb_ary_new();
10568
10569 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10570 last = prev;
10571 prev = start;
10572 rb_ary_push(ary, result);
10573 }
10574 if (last >= 0) rb_pat_search(pat, str, last, 1);
10575 else rb_backref_set(Qnil);
10576 return ary;
10577 }
10578
10579 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10580 last = prev;
10581 prev = start;
10582 rb_yield(result);
10583 str_mod_check(str, p, len);
10584 }
10585 if (last >= 0) rb_pat_search(pat, str, last, 1);
10586 return str;
10587}
10588
10589
10590/*
10591 * call-seq:
10592 * hex -> integer
10593 *
10594 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10595 * (with an optional sign and an optional <code>0x</code>) and returns the
10596 * corresponding number;
10597 * returns zero if there is no such leading substring:
10598 *
10599 * '0x0a'.hex # => 10
10600 * '-1234'.hex # => -4660
10601 * '0'.hex # => 0
10602 * 'non-numeric'.hex # => 0
10603 *
10604 * Related: String#oct.
10605 *
10606 */
10607
10608static VALUE
10609rb_str_hex(VALUE str)
10610{
10611 return rb_str_to_inum(str, 16, FALSE);
10612}
10613
10614
10615/*
10616 * call-seq:
10617 * oct -> integer
10618 *
10619 * Interprets the leading substring of +self+ as a string of octal digits
10620 * (with an optional sign) and returns the corresponding number;
10621 * returns zero if there is no such leading substring:
10622 *
10623 * '123'.oct # => 83
10624 * '-377'.oct # => -255
10625 * '0377non-numeric'.oct # => 255
10626 * 'non-numeric'.oct # => 0
10627 *
10628 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10629 * see Kernel#Integer.
10630 *
10631 * Related: String#hex.
10632 *
10633 */
10634
10635static VALUE
10636rb_str_oct(VALUE str)
10637{
10638 return rb_str_to_inum(str, -8, FALSE);
10639}
10640
/*
 * Fallback used only when no crypt_r() is available: calls to crypt() in
 * rb_str_crypt are serialized through this file-scope lock.
 */
#ifndef HAVE_CRYPT_R
# include "ruby/thread_native.h"
# include "ruby/atomic.h"

/* The lock is statically initialized, so no runtime setup is required. */
static struct {
    rb_nativethread_lock_t lock;
} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};

/* Intentionally empty; rb_str_crypt calls it before taking the lock. */
static void
crypt_mutex_initialize(void)
{
}
#endif
10654
10655/*
10656 * call-seq:
10657 * crypt(salt_str) -> new_string
10658 *
10659 * Returns the string generated by calling <code>crypt(3)</code>
10660 * standard library function with <code>str</code> and
10661 * <code>salt_str</code>, in this order, as its arguments. Please do
10662 * not use this method any longer. It is legacy; provided only for
10663 * backward compatibility with ruby scripts in earlier days. It is
10664 * bad to use in contemporary programs for several reasons:
10665 *
10666 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10667 * run. The generated string lacks data portability.
10668 *
10669 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10670 * (i.e. silently ends up in unexpected results).
10671 *
10672 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10673 * thread safe.
10674 *
10675 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10676 * very very weak. According to its manpage, Linux's traditional
10677 * <code>crypt(3)</code> output has only 2**56 variations; too
10678 * easy to brute force today. And this is the default behaviour.
10679 *
10680 * * In order to make things robust some OSes implement so-called
10681 * "modular" usage. To go through, you have to do a complex
10682 * build-up of the <code>salt_str</code> parameter, by hand.
10683 * Failure in generation of a proper salt string tends not to
10684 * yield any errors; typos in parameters are normally not
10685 * detectable.
10686 *
10687 * * For instance, in the following example, the second invocation
10688 * of String#crypt is wrong; it has a typo in "round=" (lacks
10689 * "s"). However the call does not fail and something unexpected
10690 * is generated.
10691 *
10692 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10693 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10694 *
10695 * * Even in the "modular" mode, some hash functions are considered
10696 * archaic and no longer recommended at all; for instance module
10697 * <code>$1$</code> is officially abandoned by its author: see
10698 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10699 * instance module <code>$3$</code> is considered completely
10700 * broken: see the manpage of FreeBSD.
10701 *
10702 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10703 * written above, <code>crypt(3)</code> on Mac OS never fails.
10704 * This means even if you build up a proper salt string it
10705 * generates a traditional DES hash anyways, and there is no way
10706 * for you to be aware of.
10707 *
10708 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10709 *
10710 * If for some reason you cannot migrate to other secure contemporary
10711 * password hashing algorithms, install the string-crypt gem and
10712 * <code>require 'string/crypt'</code> to continue using it.
10713 */
10714
static VALUE
rb_str_crypt(VALUE str, VALUE salt)
{
    /*
     * CRYPT_END() releases whatever the selected backend acquired: the
     * ALLOCV buffer holding the crypt_data for crypt_r(), or the file-scope
     * lock that serializes calls to crypt().
     */
#ifdef HAVE_CRYPT_R
    VALUE databuf;
    struct crypt_data *data;
# define CRYPT_END() ALLOCV_END(databuf)
#else
    extern char *crypt(const char *, const char *);
# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
#endif
    VALUE result;
    const char *s, *saltp;
    char *res;
#ifdef BROKEN_CRYPT
    char salt_8bit_clean[3];
#endif

    StringValue(salt);
    mustnot_wchar(str);
    mustnot_wchar(salt);
    s = StringValueCStr(str);
    saltp = RSTRING_PTR(salt);
    /* the first two salt bytes must exist and be non-NUL */
    if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
        rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
    }

#ifdef BROKEN_CRYPT
    /* when either of the first two salt bytes is non-ASCII, pass crypt a
     * two-byte copy with the high bit cleared */
    if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
        salt_8bit_clean[0] = saltp[0] & 0x7f;
        salt_8bit_clean[1] = saltp[1] & 0x7f;
        salt_8bit_clean[2] = '\0';
        saltp = salt_8bit_clean;
    }
#endif
#ifdef HAVE_CRYPT_R
    data = ALLOCV(databuf, sizeof(struct crypt_data));
# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
    data->initialized = 0;
# endif
    res = crypt_r(s, saltp, data);
#else
    crypt_mutex_initialize();
    rb_nativethread_lock_lock(&crypt_mutex.lock);
    res = crypt(s, saltp);
#endif
    if (!res) {
        /* save errno before the cleanup call, which could overwrite it */
        int err = errno;
        CRYPT_END();
        rb_syserr_fail(err, "crypt");
    }
    /* copy the result into a Ruby string before releasing the backend
     * resources */
    result = rb_str_new_cstr(res);
    CRYPT_END();
    return result;
}
10770
10771
10772/*
10773 * call-seq:
10774 * ord -> integer
10775 *
10776 * :include: doc/string/ord.rdoc
10777 *
10778 */
10779
10780static VALUE
10781rb_str_ord(VALUE s)
10782{
10783 unsigned int c;
10784
10785 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10786 return UINT2NUM(c);
10787}
10788/*
10789 * call-seq:
10790 * sum(n = 16) -> integer
10791 *
10792 * :include: doc/string/sum.rdoc
10793 *
10794 */
10795
10796static VALUE
10797rb_str_sum(int argc, VALUE *argv, VALUE str)
10798{
10799 int bits = 16;
10800 char *ptr, *p, *pend;
10801 long len;
10802 VALUE sum = INT2FIX(0);
10803 unsigned long sum0 = 0;
10804
10805 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10806 bits = 0;
10807 }
10808 ptr = p = RSTRING_PTR(str);
10809 len = RSTRING_LEN(str);
10810 pend = p + len;
10811
10812 while (p < pend) {
10813 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10814 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10815 str_mod_check(str, ptr, len);
10816 sum0 = 0;
10817 }
10818 sum0 += (unsigned char)*p;
10819 p++;
10820 }
10821
10822 if (bits == 0) {
10823 if (sum0) {
10824 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10825 }
10826 }
10827 else {
10828 if (sum == INT2FIX(0)) {
10829 if (bits < (int)sizeof(long)*CHAR_BIT) {
10830 sum0 &= (((unsigned long)1)<<bits)-1;
10831 }
10832 sum = LONG2FIX(sum0);
10833 }
10834 else {
10835 VALUE mod;
10836
10837 if (sum0) {
10838 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10839 }
10840
10841 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10842 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10843 sum = rb_funcall(sum, '&', 1, mod);
10844 }
10845 }
10846 return sum;
10847}
10848
/*
 * Shared implementation of String#ljust, #rjust and #center.
 *
 * argv holds the target width (in characters) and an optional pad string
 * (a single space by default).  +jflag+ selects the placement: 'l' puts all
 * padding on the right, 'r' puts it all on the left, anything else splits
 * it with the smaller half on the left.
 *
 * Returns a new String.  When the width is negative or not larger than the
 * receiver's character length, a plain duplicate of +str+ is returned.
 * Raises ArgumentError for an empty pad string ("zero width padding") or
 * when the result size would overflow ("argument too big").
 */
static VALUE
rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
{
    rb_encoding *enc;
    VALUE w;
    long width, len, flen = 1, fclen = 1;   /* pad length in bytes / chars */
    VALUE res;
    char *p;
    const char *f = " ";                    /* pad bytes */
    long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
    VALUE pad;
    int singlebyte = 1, cr;
    int termlen;

    rb_scan_args(argc, argv, "11", &w, &pad);
    enc = STR_ENC_GET(str);
    termlen = rb_enc_mbminlen(enc);
    width = NUM2LONG(w);
    if (argc == 2) {
        StringValue(pad);
        enc = rb_enc_check(str, pad);
        f = RSTRING_PTR(pad);
        flen = RSTRING_LEN(pad);
        fclen = str_strlen(pad, enc); /* rb_enc_check */
        singlebyte = single_byte_optimizable(pad);
        if (flen == 0 || fclen == 0) {
            rb_raise(rb_eArgError, "zero width padding");
        }
    }
    len = str_strlen(str, enc); /* rb_enc_check */
    if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
    /* n pad characters in total: llen on the left, rlen on the right */
    n = width - len;
    llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
    rlen = n - llen;
    cr = ENC_CODERANGE(str);
    if (flen > 1) {
        /* NOTE(review): presumably the byte length of the leading
         * (count % fclen) characters of the pad, i.e. the partial pad
         * copied after the whole repetitions -- confirm in str_offset */
        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
    }
    size = RSTRING_LEN(str);
    /* compute the padding byte count step by step, checking each step
     * against LONG_MAX before it is applied */
    if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
        (len += llen2 + rlen2) >= LONG_MAX - size) {
        rb_raise(rb_eArgError, "argument too big");
    }
    len += size;
    res = str_enc_new(rb_cString, 0, len, enc);
    p = RSTRING_PTR(res);
    /* left padding */
    if (flen <= 1) {
        memset(p, *f, llen);
        p += llen;
    }
    else {
        while (llen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            llen -= fclen;
        }
        if (llen > 0) {
            memcpy(p, f, llen2);
            p += llen2;
        }
    }
    /* the receiver's own bytes */
    memcpy(p, RSTRING_PTR(str), size);
    p += size;
    /* right padding */
    if (flen <= 1) {
        memset(p, *f, rlen);
        p += rlen;
    }
    else {
        while (rlen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            rlen -= fclen;
        }
        if (rlen > 0) {
            memcpy(p, f, rlen2);
            p += rlen2;
        }
    }
    TERM_FILL(p, termlen);
    STR_SET_LEN(res, p-RSTRING_PTR(res));

    /* combine the coderanges of receiver and pad; a broken result is left
     * unset rather than recorded */
    if (argc == 2)
        cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
    if (cr != ENC_CODERANGE_BROKEN)
        ENC_CODERANGE_SET(res, cr);

    RB_GC_GUARD(pad);
    return res;
}
10940
10941
10942/*
10943 * call-seq:
10944 * ljust(size, pad_string = ' ') -> new_string
10945 *
10946 * :include: doc/string/ljust.rdoc
10947 *
10948 * Related: String#rjust, String#center.
10949 *
10950 */
10951
10952static VALUE
10953rb_str_ljust(int argc, VALUE *argv, VALUE str)
10954{
10955 return rb_str_justify(argc, argv, str, 'l');
10956}
10957
10958/*
10959 * call-seq:
10960 * rjust(size, pad_string = ' ') -> new_string
10961 *
10962 * :include: doc/string/rjust.rdoc
10963 *
10964 * Related: String#ljust, String#center.
10965 *
10966 */
10967
10968static VALUE
10969rb_str_rjust(int argc, VALUE *argv, VALUE str)
10970{
10971 return rb_str_justify(argc, argv, str, 'r');
10972}
10973
10974
10975/*
10976 * call-seq:
10977 * center(size, pad_string = ' ') -> new_string
10978 *
10979 * :include: doc/string/center.rdoc
10980 *
10981 * Related: String#ljust, String#rjust.
10982 *
10983 */
10984
10985static VALUE
10986rb_str_center(int argc, VALUE *argv, VALUE str)
10987{
10988 return rb_str_justify(argc, argv, str, 'c');
10989}
10990
10991/*
10992 * call-seq:
10993 * partition(string_or_regexp) -> [head, match, tail]
10994 *
10995 * :include: doc/string/partition.rdoc
10996 *
10997 */
10998
10999static VALUE
11000rb_str_partition(VALUE str, VALUE sep)
11001{
11002 long pos;
11003
11004 sep = get_pat_quoted(sep, 0);
11005 if (RB_TYPE_P(sep, T_REGEXP)) {
11006 if (rb_reg_search(sep, str, 0, 0) < 0) {
11007 goto failed;
11008 }
11009 VALUE match = rb_backref_get();
11010 struct re_registers *regs = RMATCH_REGS(match);
11011
11012 pos = BEG(0);
11013 sep = rb_str_subseq(str, pos, END(0) - pos);
11014 }
11015 else {
11016 pos = rb_str_index(str, sep, 0);
11017 if (pos < 0) goto failed;
11018 }
11019 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11020 sep,
11021 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11022 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11023
11024 failed:
11025 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11026}
11027
11028/*
11029 * call-seq:
11030 * rpartition(sep) -> [head, match, tail]
11031 *
11032 * :include: doc/string/rpartition.rdoc
11033 *
11034 */
11035
11036static VALUE
11037rb_str_rpartition(VALUE str, VALUE sep)
11038{
11039 long pos = RSTRING_LEN(str);
11040
11041 sep = get_pat_quoted(sep, 0);
11042 if (RB_TYPE_P(sep, T_REGEXP)) {
11043 if (rb_reg_search(sep, str, pos, 1) < 0) {
11044 goto failed;
11045 }
11046 VALUE match = rb_backref_get();
11047 struct re_registers *regs = RMATCH_REGS(match);
11048
11049 pos = BEG(0);
11050 sep = rb_str_subseq(str, pos, END(0) - pos);
11051 }
11052 else {
11053 pos = rb_str_sublen(str, pos);
11054 pos = rb_str_rindex(str, sep, pos);
11055 if (pos < 0) {
11056 goto failed;
11057 }
11058 }
11059
11060 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11061 sep,
11062 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11063 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11064 failed:
11065 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11066}
11067
11068/*
11069 * call-seq:
11070 * start_with?(*string_or_regexp) -> true or false
11071 *
11072 * :include: doc/string/start_with_p.rdoc
11073 *
11074 */
11075
11076static VALUE
11077rb_str_start_with(int argc, VALUE *argv, VALUE str)
11078{
11079 int i;
11080
11081 for (i=0; i<argc; i++) {
11082 VALUE tmp = argv[i];
11083 if (RB_TYPE_P(tmp, T_REGEXP)) {
11084 if (rb_reg_start_with_p(tmp, str))
11085 return Qtrue;
11086 }
11087 else {
11088 const char *p, *s, *e;
11089 long slen, tlen;
11090 rb_encoding *enc;
11091
11092 StringValue(tmp);
11093 enc = rb_enc_check(str, tmp);
11094 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11095 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11096 p = RSTRING_PTR(str);
11097 e = p + slen;
11098 s = p + tlen;
11099 if (!at_char_right_boundary(p, s, e, enc))
11100 continue;
11101 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11102 return Qtrue;
11103 }
11104 }
11105 return Qfalse;
11106}
11107
11108/*
11109 * call-seq:
11110 * end_with?(*strings) -> true or false
11111 *
11112 * :include: doc/string/end_with_p.rdoc
11113 *
11114 */
11115
11116static VALUE
11117rb_str_end_with(int argc, VALUE *argv, VALUE str)
11118{
11119 int i;
11120
11121 for (i=0; i<argc; i++) {
11122 VALUE tmp = argv[i];
11123 const char *p, *s, *e;
11124 long slen, tlen;
11125 rb_encoding *enc;
11126
11127 StringValue(tmp);
11128 enc = rb_enc_check(str, tmp);
11129 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11130 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11131 p = RSTRING_PTR(str);
11132 e = p + slen;
11133 s = e - tlen;
11134 if (!at_char_boundary(p, s, e, enc))
11135 continue;
11136 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11137 return Qtrue;
11138 }
11139 return Qfalse;
11140}
11141
11151static long
11152deleted_prefix_length(VALUE str, VALUE prefix)
11153{
11154 const char *strptr, *prefixptr;
11155 long olen, prefixlen;
11156 rb_encoding *enc = rb_enc_get(str);
11157
11158 StringValue(prefix);
11159
11160 if (!is_broken_string(prefix) ||
11161 !rb_enc_asciicompat(enc) ||
11162 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11163 enc = rb_enc_check(str, prefix);
11164 }
11165
11166 /* return 0 if not start with prefix */
11167 prefixlen = RSTRING_LEN(prefix);
11168 if (prefixlen <= 0) return 0;
11169 olen = RSTRING_LEN(str);
11170 if (olen < prefixlen) return 0;
11171 strptr = RSTRING_PTR(str);
11172 prefixptr = RSTRING_PTR(prefix);
11173 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11174 if (is_broken_string(prefix)) {
11175 if (!is_broken_string(str)) {
11176 /* prefix in a valid string cannot be broken */
11177 return 0;
11178 }
11179 const char *strend = strptr + olen;
11180 const char *after_prefix = strptr + prefixlen;
11181 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11182 /* prefix does not end at char-boundary */
11183 return 0;
11184 }
11185 }
11186 /* prefix part in `str` also should be valid. */
11187
11188 return prefixlen;
11189}
11190
11191/*
11192 * call-seq:
11193 * delete_prefix!(prefix) -> self or nil
11194 *
11195 * Like String#delete_prefix, except that +self+ is modified in place.
11196 * Returns +self+ if the prefix is removed, +nil+ otherwise.
11197 *
11198 */
11199
11200static VALUE
11201rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11202{
11203 long prefixlen;
11204 str_modify_keep_cr(str);
11205
11206 prefixlen = deleted_prefix_length(str, prefix);
11207 if (prefixlen <= 0) return Qnil;
11208
11209 return rb_str_drop_bytes(str, prefixlen);
11210}
11211
11212/*
11213 * call-seq:
11214 * delete_prefix(prefix) -> new_string
11215 *
11216 * :include: doc/string/delete_prefix.rdoc
11217 *
11218 */
11219
11220static VALUE
11221rb_str_delete_prefix(VALUE str, VALUE prefix)
11222{
11223 long prefixlen;
11224
11225 prefixlen = deleted_prefix_length(str, prefix);
11226 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11227
11228 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11229}
11230
11240static long
11241deleted_suffix_length(VALUE str, VALUE suffix)
11242{
11243 const char *strptr, *suffixptr;
11244 long olen, suffixlen;
11245 rb_encoding *enc;
11246
11247 StringValue(suffix);
11248 if (is_broken_string(suffix)) return 0;
11249 enc = rb_enc_check(str, suffix);
11250
11251 /* return 0 if not start with suffix */
11252 suffixlen = RSTRING_LEN(suffix);
11253 if (suffixlen <= 0) return 0;
11254 olen = RSTRING_LEN(str);
11255 if (olen < suffixlen) return 0;
11256 strptr = RSTRING_PTR(str);
11257 suffixptr = RSTRING_PTR(suffix);
11258 const char *strend = strptr + olen;
11259 const char *before_suffix = strend - suffixlen;
11260 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11261 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11262
11263 return suffixlen;
11264}
11265
11266/*
11267 * call-seq:
11268 * delete_suffix!(suffix) -> self or nil
11269 *
11270 * Like String#delete_suffix, except that +self+ is modified in place.
11271 * Returns +self+ if the suffix is removed, +nil+ otherwise.
11272 *
11273 */
11274
11275static VALUE
11276rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11277{
11278 long olen, suffixlen, len;
11279 str_modifiable(str);
11280
11281 suffixlen = deleted_suffix_length(str, suffix);
11282 if (suffixlen <= 0) return Qnil;
11283
11284 olen = RSTRING_LEN(str);
11285 str_modify_keep_cr(str);
11286 len = olen - suffixlen;
11287 STR_SET_LEN(str, len);
11288 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11289 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11291 }
11292 return str;
11293}
11294
11295/*
11296 * call-seq:
11297 * delete_suffix(suffix) -> new_string
11298 *
11299 * :include: doc/string/delete_suffix.rdoc
11300 *
11301 */
11302
11303static VALUE
11304rb_str_delete_suffix(VALUE str, VALUE suffix)
11305{
11306 long suffixlen;
11307
11308 suffixlen = deleted_suffix_length(str, suffix);
11309 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11310
11311 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11312}
11313
11314void
11315rb_str_setter(VALUE val, ID id, VALUE *var)
11316{
11317 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11318 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11319 }
11320 *var = val;
11321}
11322
11323static void
11324rb_fs_setter(VALUE val, ID id, VALUE *var)
11325{
11326 val = rb_fs_check(val);
11327 if (!val) {
11328 rb_raise(rb_eTypeError,
11329 "value of %"PRIsVALUE" must be String or Regexp",
11330 rb_id2str(id));
11331 }
11332 if (!NIL_P(val)) {
11333 rb_warn_deprecated("'$;'", NULL);
11334 }
11335 *var = val;
11336}
11337
11338
11339/*
11340 * call-seq:
11341 * force_encoding(encoding) -> self
11342 *
11343 * :include: doc/string/force_encoding.rdoc
11344 *
11345 */
11346
11347static VALUE
11348rb_str_force_encoding(VALUE str, VALUE enc)
11349{
11350 str_modifiable(str);
11351
11352 rb_encoding *encoding = rb_to_encoding(enc);
11353 int idx = rb_enc_to_index(encoding);
11354
11355 // If the encoding is unchanged, we do nothing.
11356 if (ENCODING_GET(str) == idx) {
11357 return str;
11358 }
11359
11360 rb_enc_associate_index(str, idx);
11361
11362 // If the coderange was 7bit and the new encoding is ASCII-compatible
11363 // we can keep the coderange.
11364 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11365 return str;
11366 }
11367
11369 return str;
11370}
11371
11372/*
11373 * call-seq:
11374 * b -> string
11375 *
11376 * :include: doc/string/b.rdoc
11377 *
11378 */
11379
11380static VALUE
11381rb_str_b(VALUE str)
11382{
11383 VALUE str2;
11384 if (STR_EMBED_P(str)) {
11385 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11386 }
11387 else {
11388 str2 = str_alloc_heap(rb_cString);
11389 }
11390 str_replace_shared_without_enc(str2, str);
11391
11392 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11393 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11394 // If we know the receiver's code range then we know the result's code range.
11395 int cr = ENC_CODERANGE(str);
11396 switch (cr) {
11397 case ENC_CODERANGE_7BIT:
11399 break;
11403 break;
11404 default:
11405 ENC_CODERANGE_CLEAR(str2);
11406 break;
11407 }
11408 }
11409
11410 return str2;
11411}
11412
11413/*
11414 * call-seq:
11415 * valid_encoding? -> true or false
11416 *
11417 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11418 *
11419 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
11420 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
11421 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
11422 */
11423
11424static VALUE
11425rb_str_valid_encoding_p(VALUE str)
11426{
11427 int cr = rb_enc_str_coderange(str);
11428
11429 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11430}
11431
11432/*
11433 * call-seq:
11434 * ascii_only? -> true or false
11435 *
11436 * Returns +true+ if +self+ contains only ASCII characters,
11437 * +false+ otherwise:
11438 *
11439 * 'abc'.ascii_only? # => true
11440 * "abc\u{6666}".ascii_only? # => false
11441 *
11442 */
11443
11444static VALUE
11445rb_str_is_ascii_only_p(VALUE str)
11446{
11447 int cr = rb_enc_str_coderange(str);
11448
11449 return RBOOL(cr == ENC_CODERANGE_7BIT);
11450}
11451
11452VALUE
11454{
11455 static const char ellipsis[] = "...";
11456 const long ellipsislen = sizeof(ellipsis) - 1;
11457 rb_encoding *const enc = rb_enc_get(str);
11458 const long blen = RSTRING_LEN(str);
11459 const char *const p = RSTRING_PTR(str), *e = p + blen;
11460 VALUE estr, ret = 0;
11461
11462 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11463 if (len * rb_enc_mbminlen(enc) >= blen ||
11464 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11465 ret = str;
11466 }
11467 else if (len <= ellipsislen ||
11468 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11469 if (rb_enc_asciicompat(enc)) {
11470 ret = rb_str_new(ellipsis, len);
11471 rb_enc_associate(ret, enc);
11472 }
11473 else {
11474 estr = rb_usascii_str_new(ellipsis, len);
11475 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11476 }
11477 }
11478 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11479 rb_str_cat(ret, ellipsis, ellipsislen);
11480 }
11481 else {
11482 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11483 rb_enc_from_encoding(enc), 0, Qnil);
11484 rb_str_append(ret, estr);
11485 }
11486 return ret;
11487}
11488
11489static VALUE
11490str_compat_and_valid(VALUE str, rb_encoding *enc)
11491{
11492 int cr;
11493 str = StringValue(str);
11494 cr = rb_enc_str_coderange(str);
11495 if (cr == ENC_CODERANGE_BROKEN) {
11496 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11497 }
11498 else {
11499 rb_encoding *e = STR_ENC_GET(str);
11500 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11501 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11502 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11503 }
11504 }
11505 return str;
11506}
11507
11508static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11509
11510VALUE
11512{
11513 rb_encoding *enc = STR_ENC_GET(str);
11514 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11515}
11516
11517VALUE
11518rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11519{
11520 int cr = ENC_CODERANGE_UNKNOWN;
11521 if (enc == STR_ENC_GET(str)) {
11522 /* cached coderange makes sense only when enc equals the
11523 * actual encoding of str */
11524 cr = ENC_CODERANGE(str);
11525 }
11526 return enc_str_scrub(enc, str, repl, cr);
11527}
11528
11529static VALUE
11530enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11531{
11532 int encidx;
11533 VALUE buf = Qnil;
11534 const char *rep, *p, *e, *p1, *sp;
11535 long replen = -1;
11536 long slen;
11537
11538 if (rb_block_given_p()) {
11539 if (!NIL_P(repl))
11540 rb_raise(rb_eArgError, "both of block and replacement given");
11541 replen = 0;
11542 }
11543
11544 if (ENC_CODERANGE_CLEAN_P(cr))
11545 return Qnil;
11546
11547 if (!NIL_P(repl)) {
11548 repl = str_compat_and_valid(repl, enc);
11549 }
11550
11551 if (rb_enc_dummy_p(enc)) {
11552 return Qnil;
11553 }
11554 encidx = rb_enc_to_index(enc);
11555
11556#define DEFAULT_REPLACE_CHAR(str) do { \
11557 static const char replace[sizeof(str)-1] = str; \
11558 rep = replace; replen = (int)sizeof(replace); \
11559 } while (0)
11560
11561 slen = RSTRING_LEN(str);
11562 p = RSTRING_PTR(str);
11563 e = RSTRING_END(str);
11564 p1 = p;
11565 sp = p;
11566
11567 if (rb_enc_asciicompat(enc)) {
11568 int rep7bit_p;
11569 if (!replen) {
11570 rep = NULL;
11571 rep7bit_p = FALSE;
11572 }
11573 else if (!NIL_P(repl)) {
11574 rep = RSTRING_PTR(repl);
11575 replen = RSTRING_LEN(repl);
11576 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11577 }
11578 else if (encidx == rb_utf8_encindex()) {
11579 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11580 rep7bit_p = FALSE;
11581 }
11582 else {
11583 DEFAULT_REPLACE_CHAR("?");
11584 rep7bit_p = TRUE;
11585 }
11586 cr = ENC_CODERANGE_7BIT;
11587
11588 p = search_nonascii(p, e);
11589 if (!p) {
11590 p = e;
11591 }
11592 while (p < e) {
11593 int ret = rb_enc_precise_mbclen(p, e, enc);
11594 if (MBCLEN_NEEDMORE_P(ret)) {
11595 break;
11596 }
11597 else if (MBCLEN_CHARFOUND_P(ret)) {
11599 p += MBCLEN_CHARFOUND_LEN(ret);
11600 }
11601 else if (MBCLEN_INVALID_P(ret)) {
11602 /*
11603 * p1~p: valid ascii/multibyte chars
11604 * p ~e: invalid bytes + unknown bytes
11605 */
11606 long clen = rb_enc_mbmaxlen(enc);
11607 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11608 if (p > p1) {
11609 rb_str_buf_cat(buf, p1, p - p1);
11610 }
11611
11612 if (e - p < clen) clen = e - p;
11613 if (clen <= 2) {
11614 clen = 1;
11615 }
11616 else {
11617 const char *q = p;
11618 clen--;
11619 for (; clen > 1; clen--) {
11620 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11621 if (MBCLEN_NEEDMORE_P(ret)) break;
11622 if (MBCLEN_INVALID_P(ret)) continue;
11624 }
11625 }
11626 if (rep) {
11627 rb_str_buf_cat(buf, rep, replen);
11628 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11629 }
11630 else {
11631 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11632 str_mod_check(str, sp, slen);
11633 repl = str_compat_and_valid(repl, enc);
11634 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11637 }
11638 p += clen;
11639 p1 = p;
11640 p = search_nonascii(p, e);
11641 if (!p) {
11642 p = e;
11643 break;
11644 }
11645 }
11646 else {
11648 }
11649 }
11650 if (NIL_P(buf)) {
11651 if (p == e) {
11652 ENC_CODERANGE_SET(str, cr);
11653 return Qnil;
11654 }
11655 buf = rb_str_buf_new(RSTRING_LEN(str));
11656 }
11657 if (p1 < p) {
11658 rb_str_buf_cat(buf, p1, p - p1);
11659 }
11660 if (p < e) {
11661 if (rep) {
11662 rb_str_buf_cat(buf, rep, replen);
11663 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11664 }
11665 else {
11666 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11667 str_mod_check(str, sp, slen);
11668 repl = str_compat_and_valid(repl, enc);
11669 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11672 }
11673 }
11674 }
11675 else {
11676 /* ASCII incompatible */
11677 long mbminlen = rb_enc_mbminlen(enc);
11678 if (!replen) {
11679 rep = NULL;
11680 }
11681 else if (!NIL_P(repl)) {
11682 rep = RSTRING_PTR(repl);
11683 replen = RSTRING_LEN(repl);
11684 }
11685 else if (encidx == ENCINDEX_UTF_16BE) {
11686 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11687 }
11688 else if (encidx == ENCINDEX_UTF_16LE) {
11689 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11690 }
11691 else if (encidx == ENCINDEX_UTF_32BE) {
11692 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11693 }
11694 else if (encidx == ENCINDEX_UTF_32LE) {
11695 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11696 }
11697 else {
11698 DEFAULT_REPLACE_CHAR("?");
11699 }
11700
11701 while (p < e) {
11702 int ret = rb_enc_precise_mbclen(p, e, enc);
11703 if (MBCLEN_NEEDMORE_P(ret)) {
11704 break;
11705 }
11706 else if (MBCLEN_CHARFOUND_P(ret)) {
11707 p += MBCLEN_CHARFOUND_LEN(ret);
11708 }
11709 else if (MBCLEN_INVALID_P(ret)) {
11710 const char *q = p;
11711 long clen = rb_enc_mbmaxlen(enc);
11712 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11713 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11714
11715 if (e - p < clen) clen = e - p;
11716 if (clen <= mbminlen * 2) {
11717 clen = mbminlen;
11718 }
11719 else {
11720 clen -= mbminlen;
11721 for (; clen > mbminlen; clen-=mbminlen) {
11722 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11723 if (MBCLEN_NEEDMORE_P(ret)) break;
11724 if (MBCLEN_INVALID_P(ret)) continue;
11726 }
11727 }
11728 if (rep) {
11729 rb_str_buf_cat(buf, rep, replen);
11730 }
11731 else {
11732 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11733 str_mod_check(str, sp, slen);
11734 repl = str_compat_and_valid(repl, enc);
11735 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11736 }
11737 p += clen;
11738 p1 = p;
11739 }
11740 else {
11742 }
11743 }
11744 if (NIL_P(buf)) {
11745 if (p == e) {
11747 return Qnil;
11748 }
11749 buf = rb_str_buf_new(RSTRING_LEN(str));
11750 }
11751 if (p1 < p) {
11752 rb_str_buf_cat(buf, p1, p - p1);
11753 }
11754 if (p < e) {
11755 if (rep) {
11756 rb_str_buf_cat(buf, rep, replen);
11757 }
11758 else {
11759 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11760 str_mod_check(str, sp, slen);
11761 repl = str_compat_and_valid(repl, enc);
11762 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11763 }
11764 }
11766 }
11767 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11768 return buf;
11769}
11770
11771/*
11772 * call-seq:
11773 * scrub(replacement_string = default_replacement) -> new_string
11774 * scrub{|bytes| ... } -> new_string
11775 *
11776 * :include: doc/string/scrub.rdoc
11777 *
11778 */
11779static VALUE
11780str_scrub(int argc, VALUE *argv, VALUE str)
11781{
11782 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11783 VALUE new = rb_str_scrub(str, repl);
11784 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11785}
11786
11787/*
11788 * call-seq:
11789 * scrub! -> self
11790 * scrub!(replacement_string = default_replacement) -> self
11791 * scrub!{|bytes| ... } -> self
11792 *
11793 * Like String#scrub, except that any replacements are made in +self+.
11794 *
11795 */
11796static VALUE
11797str_scrub_bang(int argc, VALUE *argv, VALUE str)
11798{
11799 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11800 VALUE new = rb_str_scrub(str, repl);
11801 if (!NIL_P(new)) rb_str_replace(str, new);
11802 return str;
11803}
11804
11805static ID id_normalize;
11806static ID id_normalized_p;
11807static VALUE mUnicodeNormalize;
11808
11809static VALUE
11810unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11811{
11812 static int UnicodeNormalizeRequired = 0;
11813 VALUE argv2[2];
11814
11815 if (!UnicodeNormalizeRequired) {
11816 rb_require("unicode_normalize/normalize.rb");
11817 UnicodeNormalizeRequired = 1;
11818 }
11819 argv2[0] = str;
11820 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11821 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11822}
11823
11824/*
11825 * call-seq:
11826 * unicode_normalize(form = :nfc) -> string
11827 *
11828 * Returns a copy of +self+ with
11829 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11830 *
11831 * Argument +form+ must be one of the following symbols
11832 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11833 *
11834 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11835 * - +:nfd+: Canonical decomposition.
11836 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11837 * - +:nfkd+: Compatibility decomposition.
11838 *
11839 * The encoding of +self+ must be one of:
11840 *
11841 * - Encoding::UTF_8
11842 * - Encoding::UTF_16BE
11843 * - Encoding::UTF_16LE
11844 * - Encoding::UTF_32BE
11845 * - Encoding::UTF_32LE
11846 * - Encoding::GB18030
11847 * - Encoding::UCS_2BE
11848 * - Encoding::UCS_4BE
11849 *
11850 * Examples:
11851 *
 *    "a\u0300".unicode_normalize        # => "\u00E0" (the single precomposed character)
 *    "\u00E0".unicode_normalize(:nfd)   # => "a\u0300" ("a" followed by a combining grave accent)
11854 *
11855 * Related: String#unicode_normalize!, String#unicode_normalized?.
11856 */
11857static VALUE
11858rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11859{
11860 return unicode_normalize_common(argc, argv, str, id_normalize);
11861}
11862
11863/*
11864 * call-seq:
11865 * unicode_normalize!(form = :nfc) -> self
11866 *
11867 * Like String#unicode_normalize, except that the normalization
11868 * is performed on +self+.
11869 *
 *  Related: String#unicode_normalized?.
11871 *
11872 */
11873static VALUE
11874rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11875{
11876 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11877}
11878
11879/* call-seq:
11880 * unicode_normalized?(form = :nfc) -> true or false
11881 *
11882 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11883 * +false+ otherwise.
11884 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11885 *
11886 * Examples:
11887 *
11888 * "a\u0300".unicode_normalized? # => false
11889 * "a\u0300".unicode_normalized?(:nfd) # => true
11890 * "\u00E0".unicode_normalized? # => true
11891 * "\u00E0".unicode_normalized?(:nfd) # => false
11892 *
11893 *
11894 * Raises an exception if +self+ is not in a Unicode encoding:
11895 *
11896 * s = "\xE0".force_encoding('ISO-8859-1')
11897 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11898 *
11899 * Related: String#unicode_normalize, String#unicode_normalize!.
11900 *
11901 */
11902static VALUE
11903rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11904{
11905 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11906}
11907
11908/**********************************************************************
11909 * Document-class: Symbol
11910 *
11911 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11912 *
11913 * You can create a +Symbol+ object explicitly with:
11914 *
11915 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11916 *
11917 * The same +Symbol+ object will be
11918 * created for a given name or string for the duration of a program's
11919 * execution, regardless of the context or meaning of that name. Thus
11920 * if <code>Fred</code> is a constant in one context, a method in
11921 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11922 * will be the same object in all three contexts.
11923 *
11924 * module One
11925 * class Fred
11926 * end
11927 * $f1 = :Fred
11928 * end
11929 * module Two
11930 * Fred = 1
11931 * $f2 = :Fred
11932 * end
11933 * def Fred()
11934 * end
11935 * $f3 = :Fred
11936 * $f1.object_id #=> 2514190
11937 * $f2.object_id #=> 2514190
11938 * $f3.object_id #=> 2514190
11939 *
11940 * Constant, method, and variable names are returned as symbols:
11941 *
11942 * module One
11943 * Two = 2
11944 * def three; 3 end
11945 * @four = 4
11946 * @@five = 5
11947 * $six = 6
11948 * end
11949 * seven = 7
11950 *
11951 * One.constants
11952 * # => [:Two]
11953 * One.instance_methods(true)
11954 * # => [:three]
11955 * One.instance_variables
11956 * # => [:@four]
11957 * One.class_variables
11958 * # => [:@@five]
11959 * global_variables.grep(/six/)
11960 * # => [:$six]
11961 * local_variables
11962 * # => [:seven]
11963 *
11964 * A +Symbol+ object differs from a String object in that
11965 * a +Symbol+ object represents an identifier, while a String object
11966 * represents text or data.
11967 *
11968 * == What's Here
11969 *
11970 * First, what's elsewhere. \Class +Symbol+:
11971 *
11972 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11973 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11974 *
11975 * Here, class +Symbol+ provides methods that are useful for:
11976 *
11977 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11978 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11979 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11980 *
11981 * === Methods for Querying
11982 *
11983 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11984 * - #=~: Returns the index of the first substring in symbol that matches a
11985 * given Regexp or other object; returns +nil+ if no match is found.
11986 * - #[], #slice : Returns a substring of symbol
11987 * determined by a given index, start/length, or range, or string.
11988 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11989 * - #encoding: Returns the Encoding object that represents the encoding
11990 * of symbol.
11991 * - #end_with?: Returns +true+ if symbol ends with
11992 * any of the given strings.
11993 * - #match: Returns a MatchData object if symbol
11994 * matches a given Regexp; +nil+ otherwise.
11995 * - #match?: Returns +true+ if symbol
11996 * matches a given Regexp; +false+ otherwise.
11997 * - #length, #size: Returns the number of characters in symbol.
11998 * - #start_with?: Returns +true+ if symbol starts with
11999 * any of the given strings.
12000 *
12001 * === Methods for Comparing
12002 *
12003 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12004 * or larger than symbol.
12005 * - #==, #===: Returns +true+ if a given symbol has the same content and
12006 * encoding.
12007 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12008 * symbol is smaller than, equal to, or larger than symbol.
12009 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12010 * after Unicode case folding; +false+ otherwise.
12011 *
12012 * === Methods for Converting
12013 *
12014 * - #capitalize: Returns symbol with the first character upcased
12015 * and all other characters downcased.
12016 * - #downcase: Returns symbol with all characters downcased.
12017 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12018 * - #name: Returns the frozen string corresponding to symbol.
12019 * - #succ, #next: Returns the symbol that is the successor to symbol.
12020 * - #swapcase: Returns symbol with all upcase characters downcased
12021 * and all downcase characters upcased.
12022 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12023 * - #to_s, #id2name: Returns the string corresponding to +self+.
12024 * - #to_sym, #intern: Returns +self+.
12025 * - #upcase: Returns symbol with all characters upcased.
12026 *
12027 */
12028
12029
12030/*
12031 * call-seq:
12032 * symbol == object -> true or false
12033 *
12034 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12035 */
12036
12037#define sym_equal rb_obj_equal
12038
12039static int
12040sym_printable(const char *s, const char *send, rb_encoding *enc)
12041{
12042 while (s < send) {
12043 int n;
12044 int c = rb_enc_precise_mbclen(s, send, enc);
12045
12046 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12047 n = MBCLEN_CHARFOUND_LEN(c);
12048 c = rb_enc_mbc_to_codepoint(s, send, enc);
12049 if (!rb_enc_isprint(c, enc)) return FALSE;
12050 s += n;
12051 }
12052 return TRUE;
12053}
12054
12055int
12056rb_str_symname_p(VALUE sym)
12057{
12058 rb_encoding *enc;
12059 const char *ptr;
12060 long len;
12061 rb_encoding *resenc = rb_default_internal_encoding();
12062
12063 if (resenc == NULL) resenc = rb_default_external_encoding();
12064 enc = STR_ENC_GET(sym);
12065 ptr = RSTRING_PTR(sym);
12066 len = RSTRING_LEN(sym);
12067 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12068 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12069 return FALSE;
12070 }
12071 return TRUE;
12072}
12073
12074VALUE
12075rb_str_quote_unprintable(VALUE str)
12076{
12077 rb_encoding *enc;
12078 const char *ptr;
12079 long len;
12080 rb_encoding *resenc;
12081
12082 Check_Type(str, T_STRING);
12083 resenc = rb_default_internal_encoding();
12084 if (resenc == NULL) resenc = rb_default_external_encoding();
12085 enc = STR_ENC_GET(str);
12086 ptr = RSTRING_PTR(str);
12087 len = RSTRING_LEN(str);
12088 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12089 !sym_printable(ptr, ptr + len, enc)) {
12090 return rb_str_escape(str);
12091 }
12092 return str;
12093}
12094
12095VALUE
12096rb_id_quote_unprintable(ID id)
12097{
12098 VALUE str = rb_id2str(id);
12099 if (!rb_str_symname_p(str)) {
12100 return rb_str_escape(str);
12101 }
12102 return str;
12103}
12104
12105/*
12106 * call-seq:
12107 * inspect -> string
12108 *
12109 * Returns a string representation of +self+ (including the leading colon):
12110 *
12111 * :foo.inspect # => ":foo"
12112 *
12113 * Related: Symbol#to_s, Symbol#name.
12114 *
12115 */
12116
12117static VALUE
12118sym_inspect(VALUE sym)
12119{
12120 VALUE str = rb_sym2str(sym);
12121 const char *ptr;
12122 long len;
12123 char *dest;
12124
12125 if (!rb_str_symname_p(str)) {
12126 str = rb_str_inspect(str);
12127 len = RSTRING_LEN(str);
12128 rb_str_resize(str, len + 1);
12129 dest = RSTRING_PTR(str);
12130 memmove(dest + 1, dest, len);
12131 }
12132 else {
12133 rb_encoding *enc = STR_ENC_GET(str);
12134 VALUE orig_str = str;
12135
12136 len = RSTRING_LEN(orig_str);
12137 str = rb_enc_str_new(0, len + 1, enc);
12138
12139 // Get data pointer after allocation
12140 ptr = RSTRING_PTR(orig_str);
12141 dest = RSTRING_PTR(str);
12142 memcpy(dest + 1, ptr, len);
12143
12144 RB_GC_GUARD(orig_str);
12145 }
12146 dest[0] = ':';
12147
12149
12150 return str;
12151}
12152
12153VALUE
12155{
12156 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12157 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12158 return str;
12159}
12160
12161VALUE
12162rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12163{
12164 VALUE obj;
12165
12166 if (argc < 1) {
12167 rb_raise(rb_eArgError, "no receiver given");
12168 }
12169 obj = argv[0];
12170 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12171}
12172
12173/*
12174 * call-seq:
12175 * succ
12176 *
12177 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12178 *
12179 * :foo.succ # => :fop
12180 *
12181 * Related: String#succ.
12182 */
12183
12184static VALUE
12185sym_succ(VALUE sym)
12186{
12187 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12188}
12189
12190/*
12191 * call-seq:
12192 * symbol <=> object -> -1, 0, +1, or nil
12193 *
12194 * If +object+ is a symbol,
12195 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12196 *
12197 * :bar <=> :foo # => -1
12198 * :foo <=> :foo # => 0
12199 * :foo <=> :bar # => 1
12200 *
12201 * Otherwise, returns +nil+:
12202 *
12203 * :foo <=> 'bar' # => nil
12204 *
12205 * Related: String#<=>.
12206 */
12207
12208static VALUE
12209sym_cmp(VALUE sym, VALUE other)
12210{
12211 if (!SYMBOL_P(other)) {
12212 return Qnil;
12213 }
12214 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12215}
12216
12217/*
12218 * call-seq:
12219 * casecmp(object) -> -1, 0, 1, or nil
12220 *
12221 * :include: doc/symbol/casecmp.rdoc
12222 *
12223 */
12224
12225static VALUE
12226sym_casecmp(VALUE sym, VALUE other)
12227{
12228 if (!SYMBOL_P(other)) {
12229 return Qnil;
12230 }
12231 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12232}
12233
12234/*
12235 * call-seq:
12236 * casecmp?(object) -> true, false, or nil
12237 *
12238 * :include: doc/symbol/casecmp_p.rdoc
12239 *
12240 */
12241
12242static VALUE
12243sym_casecmp_p(VALUE sym, VALUE other)
12244{
12245 if (!SYMBOL_P(other)) {
12246 return Qnil;
12247 }
12248 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12249}
12250
12251/*
12252 * call-seq:
12253 * symbol =~ object -> integer or nil
12254 *
12255 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12256 * including possible updates to global variables;
12257 * see String#=~.
12258 *
12259 */
12260
12261static VALUE
12262sym_match(VALUE sym, VALUE other)
12263{
12264 return rb_str_match(rb_sym2str(sym), other);
12265}
12266
12267/*
12268 * call-seq:
12269 * match(pattern, offset = 0) -> matchdata or nil
12270 * match(pattern, offset = 0) {|matchdata| } -> object
12271 *
12272 * Equivalent to <tt>self.to_s.match</tt>,
12273 * including possible updates to global variables;
12274 * see String#match.
12275 *
12276 */
12277
12278static VALUE
12279sym_match_m(int argc, VALUE *argv, VALUE sym)
12280{
12281 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12282}
12283
12284/*
12285 * call-seq:
12286 * match?(pattern, offset) -> true or false
12287 *
12288 * Equivalent to <tt>sym.to_s.match?</tt>;
12289 * see String#match?.
12290 *
12291 */
12292
12293static VALUE
12294sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12295{
12296 return rb_str_match_m_p(argc, argv, sym);
12297}
12298
12299/*
12300 * call-seq:
12301 * symbol[index] -> string or nil
12302 * symbol[start, length] -> string or nil
12303 * symbol[range] -> string or nil
12304 * symbol[regexp, capture = 0] -> string or nil
12305 * symbol[substring] -> string or nil
12306 *
12307 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12308 *
12309 */
12310
12311static VALUE
12312sym_aref(int argc, VALUE *argv, VALUE sym)
12313{
12314 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12315}
12316
12317/*
12318 * call-seq:
12319 * length -> integer
12320 *
12321 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12322 */
12323
12324static VALUE
12325sym_length(VALUE sym)
12326{
12327 return rb_str_length(rb_sym2str(sym));
12328}
12329
12330/*
12331 * call-seq:
12332 * empty? -> true or false
12333 *
12334 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12335 *
12336 */
12337
12338static VALUE
12339sym_empty(VALUE sym)
12340{
12341 return rb_str_empty(rb_sym2str(sym));
12342}
12343
12344/*
12345 * call-seq:
12346 * upcase(*options) -> symbol
12347 *
12348 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12349 *
12350 * See String#upcase.
12351 *
12352 */
12353
12354static VALUE
12355sym_upcase(int argc, VALUE *argv, VALUE sym)
12356{
12357 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12358}
12359
12360/*
12361 * call-seq:
12362 * downcase(*options) -> symbol
12363 *
12364 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12365 *
12366 * See String#downcase.
12367 *
12368 * Related: Symbol#upcase.
12369 *
12370 */
12371
12372static VALUE
12373sym_downcase(int argc, VALUE *argv, VALUE sym)
12374{
12375 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12376}
12377
12378/*
12379 * call-seq:
12380 * capitalize(*options) -> symbol
12381 *
12382 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12383 *
12384 * See String#capitalize.
12385 *
12386 */
12387
12388static VALUE
12389sym_capitalize(int argc, VALUE *argv, VALUE sym)
12390{
12391 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12392}
12393
12394/*
12395 * call-seq:
12396 * swapcase(*options) -> symbol
12397 *
12398 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12399 *
12400 * See String#swapcase.
12401 *
12402 */
12403
12404static VALUE
12405sym_swapcase(int argc, VALUE *argv, VALUE sym)
12406{
12407 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12408}
12409
12410/*
12411 * call-seq:
12412 * start_with?(*string_or_regexp) -> true or false
12413 *
12414 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12415 *
12416 */
12417
12418static VALUE
12419sym_start_with(int argc, VALUE *argv, VALUE sym)
12420{
12421 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12422}
12423
12424/*
12425 * call-seq:
12426 * end_with?(*strings) -> true or false
12427 *
12428 *
12429 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12430 *
12431 */
12432
12433static VALUE
12434sym_end_with(int argc, VALUE *argv, VALUE sym)
12435{
12436 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12437}
12438
12439/*
12440 * call-seq:
12441 * encoding -> encoding
12442 *
12443 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12444 *
12445 */
12446
12447static VALUE
12448sym_encoding(VALUE sym)
12449{
12450 return rb_obj_encoding(rb_sym2str(sym));
12451}
12452
12453static VALUE
12454string_for_symbol(VALUE name)
12455{
12456 if (!RB_TYPE_P(name, T_STRING)) {
12457 VALUE tmp = rb_check_string_type(name);
12458 if (NIL_P(tmp)) {
12459 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12460 name);
12461 }
12462 name = tmp;
12463 }
12464 return name;
12465}
12466
12467ID
12469{
12470 if (SYMBOL_P(name)) {
12471 return SYM2ID(name);
12472 }
12473 name = string_for_symbol(name);
12474 return rb_intern_str(name);
12475}
12476
12477VALUE
12479{
12480 if (SYMBOL_P(name)) {
12481 return name;
12482 }
12483 name = string_for_symbol(name);
12484 return rb_str_intern(name);
12485}
12486
12487/*
12488 * call-seq:
12489 * Symbol.all_symbols -> array_of_symbols
12490 *
12491 * Returns an array of all symbols currently in Ruby's symbol table:
12492 *
12493 * Symbol.all_symbols.size # => 9334
12494 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12495 *
12496 */
12497
12498static VALUE
12499sym_all_symbols(VALUE _)
12500{
12501 return rb_sym_all_symbols();
12502}
12503
12504VALUE
12505rb_str_to_interned_str(VALUE str)
12506{
12507 return rb_fstring(str);
12508}
12509
12510VALUE
12511rb_interned_str(const char *ptr, long len)
12512{
12513 struct RString fake_str;
12514 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12515}
12516
12517VALUE
12518rb_interned_str_cstr(const char *ptr)
12519{
12520 return rb_interned_str(ptr, strlen(ptr));
12521}
12522
12523VALUE
12524rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12525{
12526 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12527 rb_enc_autoload(enc);
12528 }
12529
12530 struct RString fake_str;
12531 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12532}
12533
12534VALUE
12535rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12536{
12537 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12538 rb_enc_autoload(enc);
12539 }
12540
12541 struct RString fake_str;
12542 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12543}
12544
12545VALUE
12546rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
12547{
12548 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12549}
12550
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.
 *
 * When +str+ carries the ASCII-8BIT encoding index and the code is in
 * 0..0xfe, a single byte is appended directly.  Every other case,
 * including 0xff, goes through rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    const int is_binary = (ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex());

    if (RB_LIKELY(is_binary)) {
        /* The numeric conversion only happens on the binary path. */
        ssize_t code = RB_NUM2SSIZE(codepoint);
        const int single_byte = (code >= 0 && code < 0xff);

        if (RB_LIKELY(single_byte)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12567
12568void
12569Init_String(void)
12570{
12571 rb_cString = rb_define_class("String", rb_cObject);
12572 RUBY_ASSERT(rb_vm_fstring_table());
12573 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12575 rb_define_alloc_func(rb_cString, empty_str_alloc);
12576 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12577 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12578 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12579 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12580 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12583 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12584 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12585 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12586 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12589 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12590 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12591 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12592 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12595 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12596 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12597 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12598 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12599 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12601 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12603 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12604 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12605 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12606 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12607 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12608 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12610 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12611 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12612 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12613 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12614 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12615 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12616 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12617 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12619 rb_define_method(rb_cString, "+@", str_uplus, 0);
12620 rb_define_method(rb_cString, "-@", str_uminus, 0);
12621 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12622 rb_define_alias(rb_cString, "dedup", "-@");
12623
12624 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12625 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12626 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12627 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12630 rb_define_method(rb_cString, "undump", str_undump, 0);
12631
12632 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12633 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12634 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12635 sym_fold = ID2SYM(rb_intern_const("fold"));
12636
12637 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12638 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12639 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12640 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12641
12642 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12643 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12644 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12645 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12646
12647 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12648 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12649 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12650 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12651 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12652 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12653 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12654 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12655 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12656 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12657 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12658 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12660 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12661 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12662 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12663 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12664 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12665
12666 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12667 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12668 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12669
12670 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12671
12672 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12673 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12674 rb_define_method(rb_cString, "center", rb_str_center, -1);
12675
12676 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12677 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12678 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12679 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12680 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12681 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12682 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12683 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12684 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12685
12686 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12687 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12688 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12689 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12690 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12691 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12692 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12693 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12694 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12695
12696 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12697 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12698 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12699 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12700 rb_define_method(rb_cString, "count", rb_str_count, -1);
12701
12702 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12703 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12704 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12705 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12706
12707 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12708 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12709 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12710 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12711 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12712
12713 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12714
12715 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12716 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12717
12718 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12719 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12720
12721 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12722 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12723 rb_define_method(rb_cString, "b", rb_str_b, 0);
12724 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12725 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12726
12727 /* define UnicodeNormalize module here so that we don't have to look it up */
12728 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12729 id_normalize = rb_intern_const("normalize");
12730 id_normalized_p = rb_intern_const("normalized?");
12731
12732 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12733 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12734 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12735
12736 rb_fs = Qnil;
12737 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12738 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12739 rb_gc_register_address(&rb_fs);
12740
12741 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12745 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12746
12747 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12748 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12749 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12750 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12751 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12752 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12753
12754 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12755 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12756 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12757 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12758
12759 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12760 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12761 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12762 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12763 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12764 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12765 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12766
12767 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12768 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12769 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12770 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12771
12772 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12773 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12774
12775 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12776}
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:883
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:469
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1187
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:980
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1095
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2345
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2166
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
Definition class.c:2635
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:937
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2424
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:134
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:126
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3877
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:669
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2097
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2115
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1272
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3508
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:247
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:576
VALUE rb_cSymbol
Symbol class.
Definition string.c:79
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:179
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1260
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:78
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3192
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1289
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:904
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1154
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2930
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1173
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12524
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2253
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3615
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1102
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1394
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1295
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:923
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12546
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:788
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:414
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1099
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1186
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:673
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1835
VALUE rb_sym_all_symbols(void)
Collects every single one of the symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1042
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1841
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1892
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4198
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3695
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1905
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1681
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1459
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2404
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3680
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1370
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12154
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2476
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "defaultexternal" encoding.
Definition string.c:1346
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1675
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2958
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5272
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4049
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3055
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11453
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1752
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1717
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1136
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:958
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1465
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1920
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4035
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3448
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2342
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1938
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6480
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3063
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12518
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1376
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "defaultexternal" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:3646
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3005
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4151
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3272
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7201
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2696
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12511
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4105
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3922
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4080
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3622
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3180
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5782
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11511
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1631
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2854
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3152
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3255
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1148
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2652
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7315
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1358
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1647
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2356
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5700
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9408
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1142
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:878
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1951
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:1968
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2960
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1291
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:970
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12478
ID rb_to_id(VALUE str)
Definition string.c:12468
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1844
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3479
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4442
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1354
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:150
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1388
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2831
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2715
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1382
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2726
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1708
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
struct rb_data_type_struct rb_data_type_t
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:197
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1417
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:75
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024::@365170260060164113275356137374160141226332013204 aux
Auxiliary info.
struct RString::@157025041137035241047331270155043025061071337053::@153056146250355212360325351117351053336274231135 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
union RString::@157025041137035241047331270155043025061071337053 as
String's specific fields.
struct RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024 heap
Strings that use separated memory region for contents use this pattern.
Definition string.c:8273
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:300
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113