14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/numeric.h"
35#include "internal/object.h"
36#include "internal/proc.h"
37#include "internal/re.h"
38#include "internal/sanitizers.h"
39#include "internal/string.h"
40#include "internal/transcode.h"
45#include "ruby_assert.h"
48#if defined HAVE_CRYPT_R
49# if defined HAVE_CRYPT_H
52#elif !defined HAVE_CRYPT
53# include "missing/crypt.h"
54# define HAVE_CRYPT_R 1
/*
 * Shorthand for entry `no` of regs->beg / regs->end.  Both macros expand to
 * an expression that reads a variable literally named `regs`, so one must be
 * in scope wherever they are used.
 * NOTE(review): presumably `regs` is a regexp match-register structure and
 * the values are start/end offsets -- confirm against the users.
 */
57#define BEG(no) (regs->beg[(no)])
58#define END(no) (regs->end[(no)])
61#undef rb_usascii_str_new
65#undef rb_usascii_str_new_cstr
66#undef rb_utf8_str_new_cstr
67#undef rb_enc_str_new_cstr
68#undef rb_external_str_new_cstr
69#undef rb_locale_str_new_cstr
70#undef rb_str_dup_frozen
71#undef rb_str_buf_new_cstr
/* NOTE(review): by its name, the largest byte length of a single character;
 * no use of this constant is visible in this view -- confirm. */
125#define RUBY_MAX_CHAR_LEN 16
/*
 * String-specific flag bits.  Each one is an alias for a generic FL_USERn
 * bit, so they are tested and set with the usual FL_TEST / FL_SET family.
 */
/* Set by str_store_precomputed_hash() once a hash value has been copied in
 * right after the string's terminator. */
126#define STR_PRECOMPUTED_HASH FL_USER4
/* Set by STR_SET_SHARED on the string whose buffer another string points
 * into. */
127#define STR_SHARED_ROOT FL_USER5
/* Set by STR_SET_SHARED on a shared root whose RBASIC_CLASS is 0. */
128#define STR_BORROWED FL_USER6
/* Tested by rb_check_lockedtmp(); also part of STR_UNMODIFIABLE_MASK. */
129#define STR_TMPLOCK FL_USER7
/* Set by str_new_static() on strings whose heap pointer is the caller's own
 * buffer; code paths that free a heap buffer test this flag first. */
130#define STR_NOFREE FL_USER18
/* STR_SET_SHARED does nothing for a string carrying this flag. */
131#define STR_FAKESTR FL_USER19
133#define STR_SET_NOEMBED(str) do {\
134 FL_SET((str), STR_NOEMBED);\
135 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/*
 * Mark `str` as embedded by clearing STR_NOEMBED together with the
 * STR_SHARED and STR_NOFREE flags.  Only flag bits are touched; the macro
 * does not move or copy any string bytes.
 */
137#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
139#define STR_SET_LEN(str, n) do { \
140 RSTRING(str)->len = (n); \
144str_encindex_fastpath(
int encindex)
148 case ENCINDEX_ASCII_8BIT:
150 case ENCINDEX_US_ASCII:
158str_enc_fastpath(
VALUE str)
/*
 * Width in bytes of the terminator for `str`: 1 when str_enc_fastpath(str)
 * is true, otherwise rb_enc_mbminlen() of the encoding looked up from
 * ENCODING_GET(str).  `str` is evaluated more than once on the slow path.
 */
163#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
164#define TERM_FILL(ptr, termlen) do {\
165 char *const term_fill_ptr = (ptr);\
166 const int term_fill_len = (termlen);\
167 *term_fill_ptr = '\0';\
168 if (UNLIKELY(term_fill_len > 1))\
169 memset(term_fill_ptr, 0, term_fill_len);\
172#define RESIZE_CAPA(str,capacity) do {\
173 const int termlen = TERM_LEN(str);\
174 RESIZE_CAPA_TERM(str,capacity,termlen);\
176#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
177 if (STR_EMBED_P(str)) {\
178 if (str_embed_capa(str) < capacity + termlen) {\
179 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
180 const long tlen = RSTRING_LEN(str);\
181 memcpy(tmp, RSTRING_PTR(str), tlen);\
182 RSTRING(str)->as.heap.ptr = tmp;\
183 RSTRING(str)->len = tlen;\
184 STR_SET_NOEMBED(str);\
185 RSTRING(str)->as.heap.aux.capa = (capacity);\
189 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
190 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
191 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
192 RSTRING(str)->as.heap.aux.capa = (capacity);\
196#define STR_SET_SHARED(str, shared_str) do { \
197 if (!FL_TEST(str, STR_FAKESTR)) { \
198 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
199 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
200 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
201 FL_SET((str), STR_SHARED); \
202 FL_SET((shared_str), STR_SHARED_ROOT); \
203 if (RBASIC_CLASS((shared_str)) == 0) \
204 FL_SET_RAW((shared_str), STR_BORROWED); \
/* Direct access to the as.heap.ptr member; does not check that `str` is
 * actually in heap (non-embedded) form. */
208#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* as.heap.aux.capa plus the terminator width (TERM_LEN), as a size_t.  This
 * is the size passed to ruby_sized_xfree / SIZED_REALLOC_N for the buffer. */
209#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* Alias for the file-local get_encoding(). */
212#define STR_ENC_GET(str) get_encoding(str)
214#if !defined SHARABLE_MIDDLE_SUBSTRING
215# define SHARABLE_MIDDLE_SUBSTRING 0
217#if !SHARABLE_MIDDLE_SUBSTRING
218#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
220#define SHARABLE_SUBSTRING_P(beg, len, end) 1
225str_embed_capa(
VALUE str)
227 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
231rb_str_reembeddable_p(
VALUE str)
233 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
237rb_str_embed_size(
long capa)
243rb_str_size_as_embedded(
VALUE str)
246 if (STR_EMBED_P(str)) {
247 real_size = rb_str_embed_size(
RSTRING(str)->
len) + TERM_LEN(str);
251 else if (rb_str_reembeddable_p(str)) {
252 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
255 real_size =
sizeof(
struct RString);
259 real_size +=
sizeof(st_index_t);
266STR_EMBEDDABLE_P(
long len,
long termlen)
268 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
/*
 * Forward declarations of file-local helpers that are used before their
 * definitions further down in this file.
 */
273static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
274static VALUE str_new_static(
VALUE klass,
const char *ptr,
long len,
int encindex);
276static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
277static inline void str_modifiable(
VALUE str);
282str_make_independent(
VALUE str)
284 long len = RSTRING_LEN(str);
285 int termlen = TERM_LEN(str);
286 str_make_independent_expand((str),
len, 0L, termlen);
289static inline int str_dependent_p(
VALUE str);
292rb_str_make_independent(
VALUE str)
294 if (str_dependent_p(str)) {
295 str_make_independent(str);
300rb_str_make_embedded(
VALUE str)
305 char *buf =
RSTRING(str)->as.heap.ptr;
309 STR_SET_LEN(str,
len);
312 memcpy(RSTRING_PTR(str), buf,
len);
316 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
320rb_debug_rstring_null_ptr(
const char *func)
322 fprintf(stderr,
"%s is returning NULL!! "
323 "SIGSEGV is highly expected to follow immediately.\n"
324 "If you could reproduce, attach your debugger here, "
325 "and look at the passed string.\n",
330static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
333get_encoding(
VALUE str)
339mustnot_broken(
VALUE str)
341 if (is_broken_string(str)) {
342 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
347mustnot_wchar(
VALUE str)
349 rb_encoding *enc = STR_ENC_GET(str);
350 if (rb_enc_mbminlen(enc) > 1) {
351 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
357static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
359#if SIZEOF_LONG == SIZEOF_VOIDP
360#define PRECOMPUTED_FAKESTR_HASH 1
364#ifdef PRECOMPUTED_FAKESTR_HASH
366fstring_hash(
VALUE str)
370 return (st_index_t)
RSTRING(str)->as.heap.aux.capa;
377#define fstring_hash rb_str_hash
380const struct st_hash_type rb_fstring_hash_type = {
/*
 * True when `str` does not have the FL_EXIVAR flag set and its class is
 * exactly rb_cString (a subclass instance does not qualify).
 */
385#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
387static inline st_index_t
388str_do_hash(
VALUE str)
390 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
392 if (e && !is_ascii_string(str)) {
399str_store_precomputed_hash(
VALUE str, st_index_t hash)
405 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
406 size_t free_bytes = str_embed_capa(str) - used_bytes;
410 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
412 FL_SET(str, STR_PRECOMPUTED_HASH);
420 bool force_precompute_hash;
424fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data,
int existing)
433 if (rb_objspace_garbage_object_p(str)) {
452 long len = RSTRING_LEN(str);
453 long capa =
len +
sizeof(st_index_t);
454 int term_len = TERM_LEN(str);
456 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
458 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
459 STR_SET_LEN(new_str, RSTRING_LEN(str));
461 rb_enc_copy(new_str, str);
462 str_store_precomputed_hash(new_str, fstring_hash(str));
466 rb_enc_copy(new_str, str);
467#ifdef PRECOMPUTED_FAKESTR_HASH
468 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
469 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
483 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
486 if (STR_SHARED_P(str)) {
488 str_make_independent(str);
491 if (!BARE_STRING_P(str)) {
497 RBASIC(str)->flags |= RSTRING_FSTR;
499 *key = *value = arg->fstr = str;
512 if (
FL_TEST(str, RSTRING_FSTR))
515 bare = BARE_STRING_P(str);
517 if (STR_EMBED_P(str)) {
522 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
529 rb_str_resize(str, RSTRING_LEN(str));
531 fstr = register_fstring(str,
false,
false);
534 str_replace_shared_without_enc(str, fstr);
542register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
546 .force_precompute_hash = force_precompute_hash
549#if SIZEOF_VOIDP == SIZEOF_LONG
553 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
559 st_table *frozen_strings = rb_vm_fstring_table();
562 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
563 }
while (UNDEF_P(args.fstr));
576setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
591 return (
VALUE)fake_str;
598rb_setup_fake_str(
struct RString *fake_str,
const char *name,
long len, rb_encoding *enc)
600 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
609rb_fstring_new(
const char *ptr,
long len)
612 return register_fstring(setup_fake_str(&fake_str, ptr,
len, ENCINDEX_US_ASCII),
false,
false);
616rb_fstring_enc_new(
const char *ptr,
long len, rb_encoding *enc)
619 return register_fstring(rb_setup_fake_str(&fake_str, ptr,
len, enc),
false,
false);
623rb_fstring_cstr(
const char *ptr)
625 return rb_fstring_new(ptr, strlen(ptr));
629fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
639 const char *aptr, *bptr;
642 return (alen != blen ||
644 memcmp(aptr, bptr, alen) != 0);
648single_byte_optimizable(
VALUE str)
652 case ENCINDEX_ASCII_8BIT:
653 case ENCINDEX_US_ASCII:
675static inline const char *
676search_nonascii(
const char *p,
const char *e)
678 const uintptr_t *s, *t;
680#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
681# if SIZEOF_UINTPTR_T == 8
682# define NONASCII_MASK UINT64_C(0x8080808080808080)
683# elif SIZEOF_UINTPTR_T == 4
684# define NONASCII_MASK UINT32_C(0x80808080)
686# error "don't know what to do."
689# if SIZEOF_UINTPTR_T == 8
690# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
691# elif SIZEOF_UINTPTR_T == 4
692# define NONASCII_MASK 0x80808080UL
694# error "don't know what to do."
698 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
699#if !UNALIGNED_WORD_ACCESS
700 if ((uintptr_t)p % SIZEOF_VOIDP) {
701 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
706 case 7:
if (p[-7]&0x80)
return p-7;
707 case 6:
if (p[-6]&0x80)
return p-6;
708 case 5:
if (p[-5]&0x80)
return p-5;
709 case 4:
if (p[-4]&0x80)
return p-4;
711 case 3:
if (p[-3]&0x80)
return p-3;
712 case 2:
if (p[-2]&0x80)
return p-2;
713 case 1:
if (p[-1]&0x80)
return p-1;
718#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
719#define aligned_ptr(value) \
720 __builtin_assume_aligned((value), sizeof(uintptr_t))
722#define aligned_ptr(value) (uintptr_t *)(value)
725 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
728 if (*s & NONASCII_MASK) {
729#ifdef WORDS_BIGENDIAN
730 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
732 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
742 case 7:
if (e[-7]&0x80)
return e-7;
743 case 6:
if (e[-6]&0x80)
return e-6;
744 case 5:
if (e[-5]&0x80)
return e-5;
745 case 4:
if (e[-4]&0x80)
return e-4;
747 case 3:
if (e[-3]&0x80)
return e-3;
748 case 2:
if (e[-2]&0x80)
return e-2;
749 case 1:
if (e[-1]&0x80)
return e-1;
755coderange_scan(
const char *p,
long len, rb_encoding *enc)
757 const char *e = p +
len;
759 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
761 p = search_nonascii(p, e);
765 if (rb_enc_asciicompat(enc)) {
766 p = search_nonascii(p, e);
769 int ret = rb_enc_precise_mbclen(p, e, enc);
773 p = search_nonascii(p, e);
779 int ret = rb_enc_precise_mbclen(p, e, enc);
795 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
798 p = search_nonascii(p, e);
802 else if (rb_enc_asciicompat(enc)) {
803 p = search_nonascii(p, e);
809 int ret = rb_enc_precise_mbclen(p, e, enc);
816 p = search_nonascii(p, e);
822 int ret = rb_enc_precise_mbclen(p, e, enc);
847 rb_enc_set_index(str1, rb_enc_get_index(str2));
855rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
860 str_enc_copy(dest, src);
861 if (RSTRING_LEN(dest) == 0) {
862 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
873 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
874 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
885rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
887 str_enc_copy(dest, src);
892enc_coderange_scan(
VALUE str, rb_encoding *enc)
894 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
898rb_enc_str_coderange_scan(
VALUE str, rb_encoding *enc)
900 return enc_coderange_scan(str, enc);
909 cr = enc_coderange_scan(str, get_encoding(str));
916rb_enc_str_asciicompat(
VALUE str)
919 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
927 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
936str_mod_check(
VALUE s,
const char *p,
long len)
938 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
944str_capacity(
VALUE str,
const int termlen)
946 if (STR_EMBED_P(str)) {
947 return str_embed_capa(str) - termlen;
949 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
953 return RSTRING(str)->as.heap.aux.capa;
960 return str_capacity(str, TERM_LEN(str));
964must_not_null(
const char *ptr)
967 rb_raise(rb_eArgError,
"NULL pointer given");
974 size_t size = rb_str_embed_size(
capa);
978 NEWOBJ_OF(str,
struct RString, klass,
985str_alloc_heap(
VALUE klass)
987 NEWOBJ_OF(str,
struct RString, klass,
994empty_str_alloc(
VALUE klass)
996 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
997 VALUE str = str_alloc_embed(klass, 0);
998 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1004str_enc_new(
VALUE klass,
const char *ptr,
long len, rb_encoding *enc)
1009 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1013 enc = rb_ascii8bit_encoding();
1016 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1018 int termlen = rb_enc_mbminlen(enc);
1020 if (STR_EMBEDDABLE_P(
len, termlen)) {
1021 str = str_alloc_embed(klass,
len + termlen);
1027 str = str_alloc_heap(klass);
1033 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1036 rb_enc_raw_set(str, enc);
1039 memcpy(RSTRING_PTR(str), ptr,
len);
1042 STR_SET_LEN(str,
len);
1043 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1048str_new(
VALUE klass,
const char *ptr,
long len)
1050 return str_enc_new(klass, ptr,
len, rb_ascii8bit_encoding());
1062 return str_enc_new(
rb_cString, ptr,
len, rb_usascii_encoding());
1068 return str_enc_new(
rb_cString, ptr,
len, rb_utf8_encoding());
1072rb_enc_str_new(
const char *ptr,
long len, rb_encoding *enc)
1085 __msan_unpoison_string(ptr);
1105 if (rb_enc_mbminlen(enc) != 1) {
1106 rb_raise(rb_eArgError,
"wchar encoding given");
1108 return rb_enc_str_new(ptr, strlen(ptr), enc);
1112str_new_static(
VALUE klass,
const char *ptr,
long len,
int encindex)
1117 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1121 str = str_enc_new(klass, ptr,
len, rb_enc_from_index(encindex));
1124 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1125 str = str_alloc_heap(klass);
1127 RSTRING(str)->as.heap.ptr = (
char *)ptr;
1129 RBASIC(str)->flags |= STR_NOFREE;
1130 rb_enc_associate_index(str, encindex);
1144 return str_new_static(
rb_cString, ptr,
len, ENCINDEX_US_ASCII);
1150 return str_new_static(
rb_cString, ptr,
len, ENCINDEX_UTF_8);
1156 return str_new_static(
rb_cString, ptr,
len, rb_enc_to_index(enc));
1159static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1160 rb_encoding *from, rb_encoding *to,
1161 int ecflags,
VALUE ecopts);
1164is_enc_ascii_string(
VALUE str, rb_encoding *enc)
1166 int encidx = rb_enc_to_index(enc);
1167 if (rb_enc_get_index(str) == encidx)
1168 return is_ascii_string(str);
1179 if (!to)
return str;
1180 if (!from) from = rb_enc_get(str);
1181 if (from == to)
return str;
1182 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1183 rb_is_ascii8bit_enc(to)) {
1184 if (STR_ENC_GET(str) != to) {
1186 rb_enc_associate(str, to);
1193 from, to, ecflags, ecopts);
1194 if (
NIL_P(newstr)) {
1202rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1203 rb_encoding *from,
int ecflags,
VALUE ecopts)
1207 olen = RSTRING_LEN(newstr);
1208 if (ofs < -olen || olen < ofs)
1210 if (ofs < 0) ofs += olen;
1212 STR_SET_LEN(newstr, ofs);
1216 rb_str_modify(newstr);
1217 return str_cat_conv_enc_opts(newstr, ofs, ptr,
len, from,
1223rb_str_initialize(
VALUE str,
const char *ptr,
long len, rb_encoding *enc)
1225 STR_SET_LEN(str, 0);
1226 rb_enc_associate(str, enc);
1232str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1233 rb_encoding *from, rb_encoding *to,
1234 int ecflags,
VALUE ecopts)
1239 VALUE econv_wrapper;
1240 const unsigned char *start, *sp;
1241 unsigned char *dest, *dp;
1242 size_t converted_output = (size_t)ofs;
1247 RBASIC_CLEAR_CLASS(econv_wrapper);
1249 if (!ec)
return Qnil;
1252 sp = (
unsigned char*)ptr;
1254 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1255 (dp = dest + converted_output),
1259 size_t converted_input = sp - start;
1260 size_t rest =
len - converted_input;
1261 converted_output = dp - dest;
1263 if (converted_input && converted_output &&
1264 rest < (LONG_MAX / converted_output)) {
1265 rest = (rest * converted_output) / converted_input;
1270 olen += rest < 2 ? 2 : rest;
1271 rb_str_resize(newstr, olen);
1278 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1280 rb_enc_associate(newstr, to);
1299 const int eidx = rb_enc_to_index(eenc);
1302 return rb_enc_str_new(ptr,
len, eenc);
1306 if ((eidx == rb_ascii8bit_encindex()) ||
1307 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr +
len))) {
1311 ienc = rb_default_internal_encoding();
1312 if (!ienc || eenc == ienc) {
1313 return rb_enc_str_new(ptr,
len, eenc);
1317 if ((eidx == rb_ascii8bit_encindex()) ||
1318 (eidx == rb_usascii_encindex()) ||
1319 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr +
len))) {
1320 return rb_enc_str_new(ptr,
len, ienc);
1323 str = rb_enc_str_new(NULL, 0, ienc);
1326 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr,
len, eenc, 0,
Qnil))) {
1327 rb_str_initialize(str, ptr,
len, eenc);
1333rb_external_str_with_enc(
VALUE str, rb_encoding *eenc)
1335 int eidx = rb_enc_to_index(eenc);
1336 if (eidx == rb_usascii_encindex() &&
1337 !is_ascii_string(str)) {
1338 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1341 rb_enc_associate_index(str, eidx);
1400str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1402 const int termlen = TERM_LEN(str);
1407 if (str_embed_capa(str2) >=
len + termlen) {
1408 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1409 STR_SET_EMBED(str2);
1410 memcpy(ptr2, RSTRING_PTR(str),
len);
1411 TERM_FILL(ptr2+
len, termlen);
1415 if (STR_SHARED_P(str)) {
1416 root =
RSTRING(str)->as.heap.aux.shared;
1425 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1427 rb_fatal(
"about to free a possible shared root");
1429 char *ptr2 = STR_HEAP_PTR(str2);
1431 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1434 FL_SET(str2, STR_NOEMBED);
1435 RSTRING(str2)->as.heap.ptr = ptr;
1436 STR_SET_SHARED(str2, root);
1439 STR_SET_LEN(str2,
len);
1447 str_replace_shared_without_enc(str2, str);
1448 rb_enc_cr_str_exact_copy(str2, str);
1455 return str_replace_shared(str_alloc_heap(klass), str);
1472rb_str_new_frozen_String(
VALUE orig)
1479rb_str_tmp_frozen_acquire(
VALUE orig)
1482 return str_new_frozen_buffer(0, orig, FALSE);
1486rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1488 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1489 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1491 VALUE str = str_alloc_heap(0);
1494 FL_SET(str, STR_SHARED_ROOT);
1496 size_t capa = str_capacity(orig, TERM_LEN(orig));
1502 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1503 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1510 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1511 RBASIC(orig)->flags &= ~STR_NOFREE;
1512 STR_SET_SHARED(orig, str);
1522rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1527 if (STR_EMBED_P(tmp)) {
1536 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1540 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1541 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1546 STR_SET_LEN(tmp, 0);
1554 return str_new_frozen_buffer(klass, orig, TRUE);
1563 VALUE str = str_alloc_heap(klass);
1564 STR_SET_LEN(str, RSTRING_LEN(orig));
1565 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1566 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1567 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1568 RBASIC(orig)->flags &= ~STR_NOFREE;
1569 STR_SET_SHARED(orig, str);
1576str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1580 long len = RSTRING_LEN(orig);
1581 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1582 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1584 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1585 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
1591 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1592 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1598 if ((ofs > 0) || (rest > 0) ||
1601 str = str_new_shared(klass,
shared);
1603 RSTRING(str)->as.heap.ptr += ofs;
1604 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1612 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1613 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1615 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1616 STR_SET_LEN(str, RSTRING_LEN(orig));
1621 str = heap_str_make_shared(klass, orig);
1625 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1637str_new_empty_String(
VALUE str)
1640 rb_enc_copy(v, str);
/* Lower bound for a requested capacity: the initializer further down raises
 * any smaller `capa` to this value before sizing the buffer. */
1644#define STR_BUF_MIN_SIZE 63
1649 if (STR_EMBEDDABLE_P(
capa, 1)) {
1657 RSTRING(str)->as.heap.ptr[0] =
'\0';
1666 long len = strlen(ptr);
1677 return str_new(0, 0,
len);
1683 if (STR_EMBED_P(str)) {
1684 RB_DEBUG_COUNTER_INC(obj_str_embed);
1686 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1687 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1688 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1691 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1692 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1697rb_str_memsize(
VALUE str)
1699 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1700 return STR_HEAP_SIZE(str);
1710 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1713static inline void str_discard(
VALUE str);
1714static void str_shared_replace(
VALUE str,
VALUE str2);
1719 if (str != str2) str_shared_replace(str, str2);
1730 enc = STR_ENC_GET(str2);
1733 termlen = rb_enc_mbminlen(enc);
1735 STR_SET_LEN(str, RSTRING_LEN(str2));
1737 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1739 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1740 rb_enc_associate(str, enc);
1744 if (STR_EMBED_P(str2)) {
1746 long len = RSTRING_LEN(str2);
1749 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1750 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1751 RSTRING(str2)->as.heap.ptr = new_ptr;
1752 STR_SET_LEN(str2,
len);
1754 STR_SET_NOEMBED(str2);
1757 STR_SET_NOEMBED(str);
1759 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1761 if (
FL_TEST(str2, STR_SHARED)) {
1763 STR_SET_SHARED(str,
shared);
1766 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1770 STR_SET_EMBED(str2);
1771 RSTRING_PTR(str2)[0] = 0;
1772 STR_SET_LEN(str2, 0);
1773 rb_enc_associate(str, enc);
1787 return rb_obj_as_string_result(str, obj);
1803 len = RSTRING_LEN(str2);
1804 if (STR_SHARED_P(str2)) {
1807 STR_SET_NOEMBED(str);
1808 STR_SET_LEN(str,
len);
1809 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1810 STR_SET_SHARED(str,
shared);
1811 rb_enc_cr_str_exact_copy(str, str2);
1814 str_replace_shared(str, str2);
1823 size_t size = rb_str_embed_size(
capa);
1827 NEWOBJ_OF(str,
struct RString, klass,
1836 NEWOBJ_OF(str,
struct RString, klass,
1847 encidx = rb_enc_get_index(str);
1851 if (encidx) rb_enc_associate_index(dup, encidx);
1861 long len = RSTRING_LEN(str);
1866 STR_SET_LEN(dup, RSTRING_LEN(str));
1867 return str_duplicate_setup_encoding(str, dup, flags);
1876 root =
RSTRING(str)->as.heap.aux.shared;
1878 else if (UNLIKELY(!(flags &
FL_FREEZE))) {
1879 root = str = str_new_frozen(klass, str);
1885 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1886 FL_SET(root, STR_SHARED_ROOT);
1888 flags |= RSTRING_NOEMBED | STR_SHARED;
1890 STR_SET_LEN(dup, RSTRING_LEN(str));
1891 return str_duplicate_setup_encoding(str, dup, flags);
1897 if (STR_EMBED_P(str)) {
1898 return str_duplicate_setup_embed(klass, str, dup);
1901 return str_duplicate_setup_heap(klass, str, dup);
1909 if (STR_EMBED_P(str)) {
1910 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1913 dup = str_alloc_heap(klass);
1916 return str_duplicate_setup(klass, str, dup);
1927rb_str_dup_m(
VALUE str)
1929 if (LIKELY(BARE_STRING_P(str))) {
1940 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1947 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1951 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1952 str_duplicate_setup_embed(klass, str, new_str);
1955 new_str = ec_str_alloc_heap(ec, klass);
1956 str_duplicate_setup_heap(klass, str, new_str);
1965rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
1967 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
1969 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1986 static ID keyword_ids[2];
1987 VALUE orig, opt, venc, vcapa;
1989 rb_encoding *enc = 0;
1992 if (!keyword_ids[0]) {
1993 keyword_ids[0] = rb_id_encoding();
1994 CONST_ID(keyword_ids[1],
"capacity");
2002 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2003 enc = rb_to_encoding(venc);
2005 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2008 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2010 if (
capa < STR_BUF_MIN_SIZE) {
2011 capa = STR_BUF_MIN_SIZE;
2015 len = RSTRING_LEN(orig);
2019 if (orig == str) n = 0;
2021 str_modifiable(str);
2022 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2024 const size_t size = (size_t)
capa + termlen;
2025 const char *
const old_ptr = RSTRING_PTR(str);
2026 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2027 char *new_ptr =
ALLOC_N(
char, size);
2028 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2029 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2031 RSTRING(str)->as.heap.ptr = new_ptr;
2033 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2034 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2035 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2037 STR_SET_LEN(str,
len);
2040 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2041 rb_enc_cr_str_exact_copy(str, orig);
2043 FL_SET(str, STR_NOEMBED);
2050 rb_enc_associate(str, enc);
2062rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2068 static ID keyword_ids[2];
2071 rb_encoding *enc = NULL;
2078 keyword_ids[0] = rb_id_encoding();
2079 CONST_ID(keyword_ids[1],
"capacity");
2081 encoding = kwargs[0];
2082 capacity = kwargs[1];
2091 if (UNDEF_P(encoding)) {
2093 encoding = rb_obj_encoding(orig);
2097 if (!UNDEF_P(encoding)) {
2098 enc = rb_to_encoding(encoding);
2102 if (UNDEF_P(capacity)) {
2104 VALUE empty_str = str_new(klass,
"", 0);
2106 rb_enc_associate(empty_str, enc);
2110 VALUE copy = str_duplicate(klass, orig);
2111 rb_enc_associate(copy, enc);
2124 if (orig_capa >
capa) {
2129 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2130 STR_SET_LEN(str, 0);
/*
 * True unless the top two bits of `c` are 10, i.e. unless `c` is a UTF-8
 * continuation byte.  ASCII bytes therefore count as lead bytes as well,
 * which is what lets enc_strlen() count characters by counting lead bytes.
 */
2141#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2156static inline uintptr_t
2157count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2162 d = (d>>6) | (~d>>7);
2163 d &= NONASCII_MASK >> 7;
2166#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2168 return rb_popcount_intptr(d);
2172# if SIZEOF_VOIDP == 8
2181enc_strlen(
const char *p,
const char *e, rb_encoding *enc,
int cr)
2187 long diff = (long)(e - p);
2188 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2193 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2194 const uintptr_t *s, *t;
2195 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2196 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2197 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2198 while (p < (
const char *)s) {
2199 if (is_utf8_lead_byte(*p))
len++;
2203 len += count_utf8_lead_bytes_with_word(s);
2206 p = (
const char *)s;
2209 if (is_utf8_lead_byte(*p))
len++;
2215 else if (rb_enc_asciicompat(enc)) {
2220 q = search_nonascii(p, e);
2226 p += rb_enc_fast_mbclen(p, e, enc);
2233 q = search_nonascii(p, e);
2239 p += rb_enc_mbclen(p, e, enc);
2246 for (c=0; p<e; c++) {
2247 p += rb_enc_mbclen(p, e, enc);
2262rb_enc_strlen_cr(
const char *p,
const char *e, rb_encoding *enc,
int *cr)
2270 long diff = (long)(e - p);
2271 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2273 else if (rb_enc_asciicompat(enc)) {
2277 q = search_nonascii(p, e);
2285 ret = rb_enc_precise_mbclen(p, e, enc);
2300 for (c=0; p<e; c++) {
2301 ret = rb_enc_precise_mbclen(p, e, enc);
2308 if (p + rb_enc_mbminlen(enc) <= e)
2309 p += rb_enc_mbminlen(enc);
2320str_strlen(
VALUE str, rb_encoding *enc)
2325 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2326 if (!enc) enc = STR_ENC_GET(str);
2327 p = RSTRING_PTR(str);
2332 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2337 return enc_strlen(p, e, enc, cr);
2344 return str_strlen(str, NULL);
2358 return LONG2NUM(str_strlen(str, NULL));
2370rb_str_bytesize(
VALUE str)
2388rb_str_empty(
VALUE str)
2390 return RBOOL(RSTRING_LEN(str) == 0);
2408 char *ptr1, *ptr2, *ptr3;
2413 enc = rb_enc_check_str(str1, str2);
2416 termlen = rb_enc_mbminlen(enc);
2417 if (len1 > LONG_MAX - len2) {
2418 rb_raise(rb_eArgError,
"string size too big");
2420 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2421 ptr3 = RSTRING_PTR(str3);
2422 memcpy(ptr3, ptr1, len1);
2423 memcpy(ptr3+len1, ptr2, len2);
2424 TERM_FILL(&ptr3[len1+len2], termlen);
2440 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2443 int enc1 = rb_enc_get_index(str1);
2444 int enc2 = rb_enc_get_index(str2);
2449 else if (enc2 < 0) {
2452 else if (enc1 != enc2) {
2455 else if (len1 > LONG_MAX - len2) {
2488 rb_enc_copy(str2, str);
2493 rb_raise(rb_eArgError,
"negative argument");
2495 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2496 if (STR_EMBEDDABLE_P(
len, 1)) {
2498 memset(RSTRING_PTR(str2), 0,
len + 1);
2505 STR_SET_LEN(str2,
len);
2506 rb_enc_copy(str2, str);
2509 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2510 rb_raise(rb_eArgError,
"argument too big");
2513 len *= RSTRING_LEN(str);
2514 termlen = TERM_LEN(str);
2516 ptr2 = RSTRING_PTR(str2);
2518 n = RSTRING_LEN(str);
2519 memcpy(ptr2, RSTRING_PTR(str), n);
2520 while (n <=
len/2) {
2521 memcpy(ptr2 + n, ptr2, n);
2524 memcpy(ptr2 + n, ptr2,
len-n);
2526 STR_SET_LEN(str2,
len);
2527 TERM_FILL(&ptr2[
len], termlen);
2528 rb_enc_cr_str_copy_for_substr(str2, str);
2554 VALUE tmp = rb_check_array_type(arg);
2563rb_check_lockedtmp(
VALUE str)
2565 if (
FL_TEST(str, STR_TMPLOCK)) {
/*
 * Union of the flags FL_FREEZE, STR_TMPLOCK and STR_CHILLED.
 * str_modifiable() tests all three with a single FL_ANY_RAW and only runs
 * its individual checks when at least one of them is set.
 */
2572#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2574str_modifiable(
VALUE str)
2576 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2577 if (CHILLED_STRING_P(str)) {
2578 CHILLED_STRING_MUTATED(str);
2580 rb_check_lockedtmp(str);
2581 rb_check_frozen(str);
2586str_dependent_p(
VALUE str)
2588 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/*
 * STR_UNMODIFIABLE_MASK extended with STR_SHARED and STR_NOFREE.
 * str_independent() tests this mask with one FL_ANY_RAW and falls into
 * str_modifiable() / str_dependent_p() only when some bit is set.
 */
2598#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2600str_independent(
VALUE str)
2602 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2603 str_modifiable(str);
2604 return !str_dependent_p(str);
2610str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2618 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2619 ptr =
RSTRING(str)->as.heap.ptr;
2623 STR_SET_LEN(str,
len);
2628 oldptr = RSTRING_PTR(str);
2630 memcpy(ptr, oldptr,
len);
2632 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2635 STR_SET_NOEMBED(str);
2636 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2637 TERM_FILL(ptr +
len, termlen);
2638 RSTRING(str)->as.heap.ptr = ptr;
2639 STR_SET_LEN(str,
len);
2646 if (!str_independent(str))
2647 str_make_independent(str);
2654 int termlen = TERM_LEN(str);
2655 long len = RSTRING_LEN(str);
2658 rb_raise(rb_eArgError,
"negative expanding string size");
2660 if (expand >= LONG_MAX -
len) {
2661 rb_raise(rb_eArgError,
"string size too big");
2664 if (!str_independent(str)) {
2665 str_make_independent_expand(str,
len, expand, termlen);
2667 else if (expand > 0) {
2668 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2675str_modify_keep_cr(
VALUE str)
2677 if (!str_independent(str))
2678 str_make_independent(str);
2685str_discard(
VALUE str)
2687 str_modifiable(str);
2688 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2689 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2690 RSTRING(str)->as.heap.ptr = 0;
2691 STR_SET_LEN(str, 0);
2698 int encindex = rb_enc_get_index(str);
2700 if (RB_UNLIKELY(encindex == -1)) {
2704 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2708 rb_encoding *enc = rb_enc_from_index(encindex);
2709 if (!rb_enc_asciicompat(enc)) {
2729 return RSTRING_PTR(str);
2733zero_filled(
const char *s,
int n)
2735 for (; n > 0; --n) {
2742str_null_char(
const char *s,
long len,
const int minlen, rb_encoding *enc)
2744 const char *e = s +
len;
2746 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2747 if (zero_filled(s, minlen))
return s;
2753str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2758 if (str_dependent_p(str)) {
2759 if (!zero_filled(s +
len, termlen))
2760 str_make_independent_expand(str,
len, 0L, termlen);
2763 TERM_FILL(s +
len, termlen);
2766 return RSTRING_PTR(str);
2770rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2772 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2773 long len = RSTRING_LEN(str);
2777 rb_check_lockedtmp(str);
2778 str_make_independent_expand(str,
len, 0L, termlen);
2780 else if (str_dependent_p(str)) {
2781 if (termlen > oldtermlen)
2782 str_make_independent_expand(str,
len, 0L, termlen);
2785 if (!STR_EMBED_P(str)) {
2790 if (termlen > oldtermlen) {
2791 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
2799str_null_check(
VALUE str,
int *w)
2801 char *s = RSTRING_PTR(str);
2802 long len = RSTRING_LEN(str);
2803 rb_encoding *enc = rb_enc_get(str);
2804 const int minlen = rb_enc_mbminlen(enc);
2808 if (str_null_char(s,
len, minlen, enc)) {
2811 return str_fill_term(str, s,
len, minlen);
2814 if (!s || memchr(s, 0,
len)) {
2818 s = str_fill_term(str, s,
len, minlen);
2824rb_str_to_cstr(
VALUE str)
2827 return str_null_check(str, &w);
2835 char *s = str_null_check(str, &w);
2838 rb_raise(rb_eArgError,
"string contains null char");
2840 rb_raise(rb_eArgError,
"string contains null byte");
2846rb_str_fill_terminator(
VALUE str,
const int newminlen)
2848 char *s = RSTRING_PTR(str);
2849 long len = RSTRING_LEN(str);
2850 return str_fill_term(str, s,
len, newminlen);
2856 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2880str_nth_len(
const char *p,
const char *e,
long *nthp, rb_encoding *enc)
2889 else if (rb_enc_asciicompat(enc)) {
2890 const char *p2, *e2;
2893 while (p < e && 0 < nth) {
2900 p2 = search_nonascii(p, e2);
2909 n = rb_enc_mbclen(p, e, enc);
2920 while (p < e && nth--) {
2921 p += rb_enc_mbclen(p, e, enc);
2930rb_enc_nth(
const char *p,
const char *e,
long nth, rb_encoding *enc)
2932 return str_nth_len(p, e, &nth, enc);
2936str_nth(
const char *p,
const char *e,
long nth, rb_encoding *enc,
int singlebyte)
2941 p = str_nth_len(p, e, &nth, enc);
2950str_offset(
const char *p,
const char *e,
long nth, rb_encoding *enc,
int singlebyte)
2952 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2953 if (!pp)
return e - p;
2960 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
2961 STR_ENC_GET(str), single_byte_optimizable(str));
2966str_utf8_nth(
const char *p,
const char *e,
long *nthp)
2969 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
2970 const uintptr_t *s, *t;
2971 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2972 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2973 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2974 while (p < (
const char *)s) {
2975 if (is_utf8_lead_byte(*p)) nth--;
2979 nth -= count_utf8_lead_bytes_with_word(s);
2981 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
2985 if (is_utf8_lead_byte(*p)) {
2986 if (nth == 0)
break;
2996str_utf8_offset(
const char *p,
const char *e,
long nth)
2998 const char *pp = str_utf8_nth(p, e, &nth);
3007 if (single_byte_optimizable(str) || pos < 0)
3010 char *p = RSTRING_PTR(str);
3011 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3016str_subseq(
VALUE str,
long beg,
long len)
3024 const int termlen = TERM_LEN(str);
3025 if (!SHARABLE_SUBSTRING_P(beg,
len, RSTRING_LEN(str))) {
3032 if (str_embed_capa(str2) >=
len + termlen) {
3033 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3034 STR_SET_EMBED(str2);
3035 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3036 TERM_FILL(ptr2+
len, termlen);
3038 STR_SET_LEN(str2,
len);
3042 str_replace_shared(str2, str);
3045 RSTRING(str2)->as.heap.ptr += beg;
3046 if (RSTRING_LEN(str2) >
len) {
3047 STR_SET_LEN(str2,
len);
3057 VALUE str2 = str_subseq(str, beg,
len);
3058 rb_enc_cr_str_copy_for_substr(str2, str);
3067 const long blen = RSTRING_LEN(str);
3068 rb_encoding *enc = STR_ENC_GET(str);
3069 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3071 if (
len < 0)
return 0;
3072 if (beg < 0 && -beg < 0)
return 0;
3076 if (single_byte_optimizable(str)) {
3077 if (beg > blen)
return 0;
3080 if (beg < 0)
return 0;
3082 if (
len > blen - beg)
3084 if (
len < 0)
return 0;
3089 if (
len > -beg)
len = -beg;
3093 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3096 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3102 slen = str_strlen(str, enc);
3104 if (beg < 0)
return 0;
3106 if (
len == 0)
goto end;
3109 else if (beg > 0 && beg > blen) {
3113 if (beg > str_strlen(str, enc))
return 0;
3118 enc == rb_utf8_encoding()) {
3119 p = str_utf8_nth(s, e, &beg);
3120 if (beg > 0)
return 0;
3121 len = str_utf8_offset(p, e,
len);
3127 p = s + beg * char_sz;
3131 else if (
len * char_sz > e - p)
3136 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3137 if (beg > 0)
return 0;
3141 len = str_offset(p, e,
len, enc, 0);
/*
 * Forward declaration of str_substr(): it is called before its
 * definition further down in this file.
 * `empty` controls the zero-length case: the definition returns Qnil
 * when the resulting length is 0 and `empty` is false.
 */
3149static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3154 return str_substr(str, beg,
len, TRUE);
3164str_substr(
VALUE str,
long beg,
long len,
int empty)
3168 if (!p)
return Qnil;
3169 if (!
len && !empty)
return Qnil;
3171 beg = p - RSTRING_PTR(str);
3173 VALUE str2 = str_subseq(str, beg,
len);
3174 rb_enc_cr_str_copy_for_substr(str2, str);
3182 if (CHILLED_STRING_P(str)) {
3187 rb_str_resize(str, RSTRING_LEN(str));
3203 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3233str_uminus(
VALUE str)
3238 return rb_fstring(str);
/*
 * From this point on, the name rb_str_dup_frozen expands to
 * rb_str_new_frozen (any earlier macro of that name was #undef'ed near
 * the top of the file).
 */
3242#define rb_str_dup_frozen rb_str_new_frozen
3247 if (
FL_TEST(str, STR_TMPLOCK)) {
3250 FL_SET(str, STR_TMPLOCK);
3257 if (!
FL_TEST(str, STR_TMPLOCK)) {
3275 const int termlen = TERM_LEN(str);
3277 str_modifiable(str);
3278 if (STR_SHARED_P(str)) {
3281 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3282 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3293 else if (
len > RSTRING_LEN(str)) {
3297 const char *
const new_end = RSTRING_PTR(str) +
len;
3298 rb_encoding *enc = rb_enc_get(str);
3307 else if (
len < RSTRING_LEN(str)) {
3315 STR_SET_LEN(str,
len);
3316 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3323 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3326 int independent = str_independent(str);
3327 long slen = RSTRING_LEN(str);
3328 const int termlen = TERM_LEN(str);
3330 if (slen >
len || (termlen != 1 && slen <
len)) {
3336 if (STR_EMBED_P(str)) {
3337 if (
len == slen)
return str;
3338 if (str_embed_capa(str) >=
len + termlen) {
3339 STR_SET_LEN(str,
len);
3343 str_make_independent_expand(str, slen,
len - slen, termlen);
3345 else if (str_embed_capa(str) >=
len + termlen) {
3346 char *ptr = STR_HEAP_PTR(str);
3348 if (slen >
len) slen =
len;
3351 STR_SET_LEN(str,
len);
3352 if (independent) ruby_xfree(ptr);
3355 else if (!independent) {
3356 if (
len == slen)
return str;
3357 str_make_independent_expand(str, slen,
len - slen, termlen);
3361 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3362 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3365 else if (
len == slen)
return str;
3366 STR_SET_LEN(str,
len);
3373str_ensure_available_capa(
VALUE str,
long len)
3375 str_modify_keep_cr(str);
3377 const int termlen = TERM_LEN(str);
3378 long olen = RSTRING_LEN(str);
3380 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3381 rb_raise(rb_eArgError,
"string sizes too big");
3384 long total = olen +
len;
3385 long capa = str_capacity(str, termlen);
3388 if (total >= LONG_MAX / 2) {
3391 while (total >
capa) {
3394 RESIZE_CAPA_TERM(str,
capa, termlen);
3399str_buf_cat4(
VALUE str,
const char *ptr,
long len,
bool keep_cr)
3402 str_modify_keep_cr(str);
3407 if (
len == 0)
return 0;
3409 long total, olen,
off = -1;
3411 const int termlen = TERM_LEN(str);
3414 if (ptr >= sptr && ptr <= sptr + olen) {
3418 long capa = str_capacity(str, termlen);
3420 if (olen > LONG_MAX -
len) {
3421 rb_raise(rb_eArgError,
"string sizes too big");
3425 if (total >= LONG_MAX / 2) {
3428 while (total >
capa) {
3431 RESIZE_CAPA_TERM(str,
capa, termlen);
3432 sptr = RSTRING_PTR(str);
3437 memcpy(sptr + olen, ptr,
len);
3438 STR_SET_LEN(str, total);
3439 TERM_FILL(sptr + total, termlen);
/*
 * Convenience wrappers around str_buf_cat4() that always pass
 * keep_cr = false.
 *
 * str_buf_cat(str, ptr, len)  - append `len` bytes starting at `ptr`.
 * str_buf_cat2(str, ptr)      - length is taken from rb_strlen_lit(ptr);
 *                               presumably `ptr` must be a string literal
 *                               for that to be valid - TODO confirm against
 *                               the rb_strlen_lit definition.
 *                               Note that `ptr` is expanded twice.
 */
3444#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3445#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3450 if (
len == 0)
return str;
3452 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3454 return str_buf_cat(str, ptr,
len);
3465rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3470 if (UNLIKELY(!str_independent(str))) {
3471 str_make_independent(str);
3474 long string_length = -1;
3475 const int null_terminator_length = 1;
3480 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3481 rb_raise(rb_eArgError,
"string sizes too big");
3484 long string_capacity = str_capacity(str, null_terminator_length);
3490 if (LIKELY(string_capacity >= string_length + 1)) {
3492 sptr[string_length] = byte;
3493 STR_SET_LEN(str, string_length + 1);
3494 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3498 str_buf_cat(str, (
char *)&
byte, 1);
3514 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3525rb_enc_cr_str_buf_cat(
VALUE str,
const char *ptr,
long len,
3526 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3531 rb_encoding *str_enc, *ptr_enc;
3535 if (str_encindex == ptr_encindex) {
3537 ptr_cr = coderange_scan(ptr,
len, rb_enc_from_index(ptr_encindex));
3541 str_enc = rb_enc_from_index(str_encindex);
3542 ptr_enc = rb_enc_from_index(ptr_encindex);
3543 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3546 if (RSTRING_LEN(str) == 0) {
3549 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3555 ptr_cr = coderange_scan(ptr,
len, ptr_enc);
3564 *ptr_cr_ret = ptr_cr;
3566 if (str_encindex != ptr_encindex &&
3569 str_enc = rb_enc_from_index(str_encindex);
3570 ptr_enc = rb_enc_from_index(ptr_encindex);
3575 res_encindex = str_encindex;
3580 res_encindex = str_encindex;
3584 res_encindex = ptr_encindex;
3589 res_encindex = str_encindex;
3596 res_encindex = str_encindex;
3602 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3604 str_buf_cat(str, ptr,
len);
3610 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3617 return rb_enc_cr_str_buf_cat(str, ptr,
len,
3626 rb_encoding *enc = rb_enc_from_index(encindex);
3627 if (rb_enc_asciicompat(enc)) {
3628 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3634 unsigned int c = (
unsigned char)*ptr;
3635 int len = rb_enc_codelen(c, enc);
3636 rb_enc_mbcput(c, buf, enc);
3637 rb_enc_cr_str_buf_cat(str, buf,
len,
3650 if (str_enc_fastpath(str)) {
3654 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3660 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3671 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3687rb_str_concat_literals(
size_t num,
const VALUE *strary)
3691 unsigned long len = 1;
3696 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3698 str_enc_copy_direct(str, strary[0]);
3700 for (i = s; i < num; ++i) {
3701 const VALUE v = strary[i];
3705 if (encidx != ENCINDEX_US_ASCII) {
3707 rb_enc_set_index(str, encidx);
3732rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3734 str_modifiable(str);
3739 else if (argc > 1) {
3742 rb_enc_copy(arg_str, str);
3743 for (i = 0; i < argc; i++) {
3776rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3778 long needed_capacity = 0;
3782 for (
int index = 0; index < argc; index++) {
3783 VALUE obj = argv[index];
3791 needed_capacity += RSTRING_LEN(obj);
3796 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3803 str_ensure_available_capa(str, needed_capacity);
3806 for (
int index = 0; index < argc; index++) {
3807 VALUE obj = argv[index];
3812 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3813 char byte = (char)(
NUM2INT(obj) & 0xFF);
3822 memcpy(sptr, ptr,
len);
3827 rb_bug(
"append_as_bytes arguments should have been validated");
3831 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3832 TERM_FILL(sptr, TERM_LEN(str));
3837 for (
int index = 0; index < argc; index++) {
3838 VALUE obj = argv[index];
3855 rb_bug(
"append_as_bytes arguments should have been validated");
3925 rb_encoding *enc = STR_ENC_GET(str1);
3929 if (rb_num_to_uint(str2, &code) == 0) {
3942 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3945 rb_str_buf_cat_byte(str1, (
unsigned char)code);
3948 long pos = RSTRING_LEN(str1);
3953 switch (
len = rb_enc_codelen(code, enc)) {
3954 case ONIGERR_INVALID_CODE_POINT_VALUE:
3955 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3957 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3963 rb_enc_mbcput(code, buf, enc);
3964 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
3965 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3967 rb_str_resize(str1, pos+
len);
3968 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
3981rb_ascii8bit_appendable_encoding_index(rb_encoding *enc,
unsigned int code)
3983 int encidx = rb_enc_to_index(enc);
3985 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3990 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3991 return ENCINDEX_ASCII_8BIT;
4014rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4016 str_modifiable(str);
4021 else if (argc > 1) {
4024 rb_enc_copy(arg_str, str);
4025 for (i = 0; i < argc; i++) {
4038 st_index_t precomputed_hash;
4039 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4041 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4042 return precomputed_hash;
4045 return str_do_hash(str);
4052 const char *ptr1, *ptr2;
4055 return (len1 != len2 ||
4057 memcmp(ptr1, ptr2, len1) != 0);
4071rb_str_hash_m(
VALUE str)
/*
 * Yields the smaller of `a` and `b` (yields `a` when they compare equal).
 * Each argument appears twice in the expansion, so do not pass
 * expressions with side effects.
 */
4077#define lesser(a,b) (((a)>(b))?(b):(a))
4085 if (RSTRING_LEN(str1) == 0)
return TRUE;
4086 if (RSTRING_LEN(str2) == 0)
return TRUE;
4089 if (idx1 == idx2)
return TRUE;
4094 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4098 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4108 const char *ptr1, *ptr2;
4111 if (str1 == str2)
return 0;
4114 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4123 if (len1 > len2)
return 1;
4126 if (retval > 0)
return 1;
4153 if (str1 == str2)
return Qtrue;
4160 return rb_str_eql_internal(str1, str2);
4184 if (str1 == str2)
return Qtrue;
4186 return rb_str_eql_internal(str1, str2);
4217 return rb_invcmp(str1, str2);
4259 return str_casecmp(str1, s);
4267 const char *p1, *p1end, *p2, *p2end;
4269 enc = rb_enc_compatible(str1, str2);
4274 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4275 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4276 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4277 while (p1 < p1end && p2 < p2end) {
4279 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4280 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4282 return INT2FIX(c1 < c2 ? -1 : 1);
4289 while (p1 < p1end && p2 < p2end) {
4290 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4291 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4293 if (0 <= c1 && 0 <= c2) {
4297 return INT2FIX(c1 < c2 ? -1 : 1);
4301 l1 = rb_enc_mbclen(p1, p1end, enc);
4302 l2 = rb_enc_mbclen(p2, p2end, enc);
4303 len = l1 < l2 ? l1 : l2;
4304 r = memcmp(p1, p2,
len);
4306 return INT2FIX(r < 0 ? -1 : 1);
4308 return INT2FIX(l1 < l2 ? -1 : 1);
4314 if (RSTRING_LEN(str1) == RSTRING_LEN(str2))
return INT2FIX(0);
4315 if (RSTRING_LEN(str1) > RSTRING_LEN(str2))
return INT2FIX(1);
4349 return str_casecmp_p(str1, s);
4356 VALUE folded_str1, folded_str2;
4357 VALUE fold_opt = sym_fold;
4359 enc = rb_enc_compatible(str1, str2);
4364 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4365 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4367 return rb_str_eql(folded_str1, folded_str2);
4371strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4372 const char *sub_ptr,
long sub_len,
long offset, rb_encoding *enc)
4374 const char *search_start = str_ptr;
4375 long pos, search_len = str_len - offset;
4379 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4380 if (pos < 0)
return pos;
4382 if (t == search_start + pos)
break;
4383 search_len -= t - search_start;
4384 if (search_len <= 0)
return -1;
4385 offset += t - search_start;
4388 return pos + offset;
/*
 * Thin wrappers over rb_strseq_index() that fix its last argument
 * (`in_byte`):
 *
 * rb_str_index     - in_byte = 0: rb_strseq_index() measures lengths with
 *                    str_strlen() (unless single-byte optimizable) and
 *                    converts `offset` through str_offset().
 * rb_str_byteindex - in_byte = 1: lengths and `offset` are used as raw
 *                    byte counts.
 */
4392#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4393#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4396rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4398 const char *str_ptr, *str_ptr_end, *sub_ptr;
4399 long str_len, sub_len;
4402 enc = rb_enc_check(str, sub);
4403 if (is_broken_string(sub))
return -1;
4405 str_ptr = RSTRING_PTR(str);
4407 str_len = RSTRING_LEN(str);
4408 sub_ptr = RSTRING_PTR(sub);
4409 sub_len = RSTRING_LEN(sub);
4411 if (str_len < sub_len)
return -1;
4414 long str_len_char, sub_len_char;
4415 int single_byte = single_byte_optimizable(str);
4416 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4417 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4419 offset += str_len_char;
4420 if (offset < 0)
return -1;
4422 if (str_len_char - offset < sub_len_char)
return -1;
4423 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4426 if (sub_len == 0)
return offset;
4429 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4443rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4447 rb_encoding *enc = STR_ENC_GET(str);
4450 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4451 long slen = str_strlen(str, enc);
4453 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4465 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4466 enc, single_byte_optimizable(str));
4477 pos = rb_str_index(str, sub, pos);
4491str_ensure_byte_pos(
VALUE str,
long pos)
4493 if (!single_byte_optimizable(str)) {
4494 const char *s = RSTRING_PTR(str);
4496 const char *p = s + pos;
4497 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4499 "offset %ld does not land on character boundary", pos);
4546rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4552 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4553 long slen = RSTRING_LEN(str);
4555 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4566 str_ensure_byte_pos(str, pos);
4578 pos = rb_str_byteindex(str, sub, pos);
4579 if (pos >= 0)
return LONG2NUM(pos);
4586memrchr(
const char *search_str,
int chr,
long search_len)
4588 const char *ptr = search_str + search_len;
4589 while (ptr > search_str) {
4590 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4598str_rindex(
VALUE str,
VALUE sub,
const char *s, rb_encoding *enc)
4600 char *hit, *adjusted;
4602 long slen, searchlen;
4605 sbeg = RSTRING_PTR(str);
4606 slen = RSTRING_LEN(sub);
4607 if (slen == 0)
return s - sbeg;
4609 t = RSTRING_PTR(sub);
4611 searchlen = s - sbeg + 1;
4613 if (memcmp(s, t, slen) == 0) {
4618 hit = memrchr(sbeg, c, searchlen);
4621 if (hit != adjusted) {
4622 searchlen = adjusted - sbeg;
4625 if (memcmp(hit, t, slen) == 0)
4627 searchlen = adjusted - sbeg;
4628 }
while (searchlen > 0);
4642 enc = rb_enc_check(str, sub);
4643 if (is_broken_string(sub))
return -1;
4644 singlebyte = single_byte_optimizable(str);
4645 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4646 slen = str_strlen(sub, enc);
4649 if (
len < slen)
return -1;
4650 if (
len - pos < slen) pos =
len - slen;
4651 if (
len == 0)
return pos;
4653 sbeg = RSTRING_PTR(str);
4656 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4662 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4663 return str_rindex(str, sub, s, enc);
4724rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4728 rb_encoding *enc = STR_ENC_GET(str);
4729 long pos,
len = str_strlen(str, enc);
4731 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4733 if (pos < 0 && (pos +=
len) < 0) {
4739 if (pos >
len) pos =
len;
4747 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4748 enc, single_byte_optimizable(str));
4759 pos = rb_str_rindex(str, sub, pos);
4769rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4775 enc = rb_enc_check(str, sub);
4776 if (is_broken_string(sub))
return -1;
4777 len = RSTRING_LEN(str);
4778 slen = RSTRING_LEN(sub);
4781 if (
len < slen)
return -1;
4782 if (
len - pos < slen) pos =
len - slen;
4783 if (
len == 0)
return pos;
4785 sbeg = RSTRING_PTR(str);
4788 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4795 return str_rindex(str, sub, s, enc);
4860rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4864 long pos,
len = RSTRING_LEN(str);
4866 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4868 if (pos < 0 && (pos +=
len) < 0) {
4874 if (pos >
len) pos =
len;
4880 str_ensure_byte_pos(str, pos);
4892 pos = rb_str_byterindex(str, sub, pos);
4893 if (pos >= 0)
return LONG2NUM(pos);
4929 switch (OBJ_BUILTIN_TYPE(y)) {
4981rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
4988 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5020rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5024 re = get_pat(argv[0]);
5025 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5034static enum neighbor_char
5035enc_succ_char(
char *p,
long len, rb_encoding *enc)
5040 if (rb_enc_mbminlen(enc) > 1) {
5042 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5044 return NEIGHBOR_NOT_CHAR;
5046 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5048 if (!l)
return NEIGHBOR_NOT_CHAR;
5049 if (l !=
len)
return NEIGHBOR_WRAPPED;
5050 rb_enc_mbcput(c, p, enc);
5051 r = rb_enc_precise_mbclen(p, p +
len, enc);
5053 return NEIGHBOR_NOT_CHAR;
5055 return NEIGHBOR_FOUND;
5058 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5061 return NEIGHBOR_WRAPPED;
5062 ++((
unsigned char*)p)[i];
5063 l = rb_enc_precise_mbclen(p, p+
len, enc);
5067 return NEIGHBOR_FOUND;
5070 memset(p+l, 0xff,
len-l);
5076 for (len2 =
len-1; 0 < len2; len2--) {
5077 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5081 memset(p+len2+1, 0xff,
len-(len2+1));
5086static enum neighbor_char
5087enc_pred_char(
char *p,
long len, rb_encoding *enc)
5091 if (rb_enc_mbminlen(enc) > 1) {
5093 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5095 return NEIGHBOR_NOT_CHAR;
5097 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5098 if (!c)
return NEIGHBOR_NOT_CHAR;
5101 if (!l)
return NEIGHBOR_NOT_CHAR;
5102 if (l !=
len)
return NEIGHBOR_WRAPPED;
5103 rb_enc_mbcput(c, p, enc);
5104 r = rb_enc_precise_mbclen(p, p +
len, enc);
5106 return NEIGHBOR_NOT_CHAR;
5108 return NEIGHBOR_FOUND;
5111 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5114 return NEIGHBOR_WRAPPED;
5115 --((
unsigned char*)p)[i];
5116 l = rb_enc_precise_mbclen(p, p+
len, enc);
5120 return NEIGHBOR_FOUND;
5123 memset(p+l, 0,
len-l);
5129 for (len2 =
len-1; 0 < len2; len2--) {
5130 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5134 memset(p+len2+1, 0,
len-(len2+1));
5148static enum neighbor_char
5149enc_succ_alnum_char(
char *p,
long len, rb_encoding *enc,
char *carry)
5151 enum neighbor_char ret;
5155 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5159 const int max_gaps = 1;
5161 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5163 ctype = ONIGENC_CTYPE_DIGIT;
5165 ctype = ONIGENC_CTYPE_ALPHA;
5167 return NEIGHBOR_NOT_CHAR;
5170 for (
try = 0;
try <= max_gaps; ++
try) {
5171 ret = enc_succ_char(p,
len, enc);
5172 if (ret == NEIGHBOR_FOUND) {
5173 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5175 return NEIGHBOR_FOUND;
5182 ret = enc_pred_char(p,
len, enc);
5183 if (ret == NEIGHBOR_FOUND) {
5184 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5197 return NEIGHBOR_NOT_CHAR;
5200 if (ctype != ONIGENC_CTYPE_DIGIT) {
5202 return NEIGHBOR_WRAPPED;
5206 enc_succ_char(carry,
len, enc);
5207 return NEIGHBOR_WRAPPED;
5275 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5276 rb_enc_cr_str_copy_for_substr(str, orig);
5277 return str_succ(str);
5284 char *sbeg, *s, *e, *last_alnum = 0;
5285 int found_alnum = 0;
5287 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5288 long carry_pos = 0, carry_len = 1;
5289 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5291 slen = RSTRING_LEN(str);
5292 if (slen == 0)
return str;
5294 enc = STR_ENC_GET(str);
5295 sbeg = RSTRING_PTR(str);
5296 s = e = sbeg + slen;
5298 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5299 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5305 l = rb_enc_precise_mbclen(s, e, enc);
5306 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5307 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5308 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5310 case NEIGHBOR_NOT_CHAR:
5312 case NEIGHBOR_FOUND:
5314 case NEIGHBOR_WRAPPED:
5319 carry_pos = s - sbeg;
5324 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5325 enum neighbor_char neighbor;
5326 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5327 l = rb_enc_precise_mbclen(s, e, enc);
5328 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5329 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5331 neighbor = enc_succ_char(tmp, l, enc);
5333 case NEIGHBOR_FOUND:
5337 case NEIGHBOR_WRAPPED:
5340 case NEIGHBOR_NOT_CHAR:
5343 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5345 enc_succ_char(s, l, enc);
5347 if (!rb_enc_asciicompat(enc)) {
5348 MEMCPY(carry, s,
char, l);
5351 carry_pos = s - sbeg;
5355 RESIZE_CAPA(str, slen + carry_len);
5356 sbeg = RSTRING_PTR(str);
5357 s = sbeg + carry_pos;
5358 memmove(s + carry_len, s, slen - carry_pos);
5359 memmove(s, carry, carry_len);
5361 STR_SET_LEN(str, slen);
5362 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5376rb_str_succ_bang(
VALUE str)
5384all_digits_p(
const char *s,
long len)
5438 VALUE end, exclusive;
5442 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5448 VALUE current, after_end;
5455 enc = rb_enc_check(beg, end);
5456 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5458 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5459 char c = RSTRING_PTR(beg)[0];
5460 char e = RSTRING_PTR(end)[0];
5462 if (c > e || (excl && c == e))
return beg;
5464 VALUE str = rb_enc_str_new(&c, 1, enc);
5466 if ((*each)(str, arg))
break;
5467 if (!excl && c == e)
break;
5469 if (excl && c == e)
break;
5474 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5475 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5476 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5481 b = rb_str_to_inum(beg, 10, FALSE);
5482 e = rb_str_to_inum(end, 10, FALSE);
5486 rb_encoding *usascii = rb_usascii_encoding();
5489 if (excl && bi == ei)
break;
5490 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5495 ID op = excl ?
'<' : idLE;
5496 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5501 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5502 b = rb_funcallv(b, succ, 0, 0);
5509 if (n > 0 || (excl && n == 0))
return beg;
5511 after_end = rb_funcallv(end, succ, 0, 0);
5516 next = rb_funcallv(current, succ, 0, 0);
5517 if ((*each)(current, arg))
break;
5518 if (
NIL_P(next))
break;
5522 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5537 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5538 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5539 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5541 b = rb_str_to_inum(beg, 10, FALSE);
5544 rb_encoding *usascii = rb_usascii_encoding();
5547 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5555 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5556 b = rb_funcallv(b, succ, 0, 0);
5562 VALUE next = rb_funcallv(current, succ, 0, 0);
5563 if ((*each)(current, arg))
break;
5566 if (RSTRING_LEN(current) == 0)
5577 if (!
rb_equal(str, *argp))
return 0;
5591 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5592 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5593 rb_enc_asciicompat(STR_ENC_GET(val))) {
5594 const char *bp = RSTRING_PTR(beg);
5595 const char *ep = RSTRING_PTR(end);
5596 const char *vp = RSTRING_PTR(val);
5597 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5598 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5606 if (b <= v && v < e)
return Qtrue;
5607 return RBOOL(!
RTEST(exclusive) && v == e);
5614 all_digits_p(bp, RSTRING_LEN(beg)) &&
5615 all_digits_p(ep, RSTRING_LEN(end))) {
5620 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5622 return RBOOL(
NIL_P(val));
5645 return rb_str_subpat(str, indx,
INT2FIX(0));
5648 if (rb_str_index(str, indx, 0) != -1)
5654 long beg,
len = str_strlen(str, NULL);
5666 return str_substr(str, idx, 1, FALSE);
5685rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5689 return rb_str_subpat(str, argv[0], argv[1]);
5692 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5696 return rb_str_aref(str, argv[0]);
5702 char *ptr = RSTRING_PTR(str);
5703 long olen = RSTRING_LEN(str), nlen;
5705 str_modifiable(str);
5706 if (
len > olen)
len = olen;
5708 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5710 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5712 ptr =
RSTRING(str)->as.embed.ary;
5713 memmove(ptr, oldptr +
len, nlen);
5714 if (fl == STR_NOEMBED)
xfree(oldptr);
5717 if (!STR_SHARED_P(str)) {
5719 rb_enc_cr_str_exact_copy(shared, str);
5724 STR_SET_LEN(str, nlen);
5726 if (!SHARABLE_MIDDLE_SUBSTRING) {
5727 TERM_FILL(ptr + nlen, TERM_LEN(str));
5734rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5740 if (beg == 0 && vlen == 0) {
5745 str_modify_keep_cr(str);
5749 RESIZE_CAPA(str, slen + vlen -
len);
5750 sptr = RSTRING_PTR(str);
5759 memmove(sptr + beg + vlen,
5761 slen - (beg +
len));
5763 if (vlen < beg &&
len < 0) {
5767 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5770 STR_SET_LEN(str, slen);
5771 TERM_FILL(&sptr[slen], TERM_LEN(str));
5778 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5787 int singlebyte = single_byte_optimizable(str);
5793 enc = rb_enc_check(str, val);
5794 slen = str_strlen(str, enc);
5796 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5805 if (
len > slen - beg) {
5808 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5813 beg = p - RSTRING_PTR(str);
5815 rb_str_update_0(str, beg,
len, val);
5816 rb_enc_associate(str, enc);
5827 long start, end,
len;
5837 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5841 nth += regs->num_regs;
5851 enc = rb_enc_check_str(str, val);
5852 rb_str_update_0(str, start,
len, val);
5853 rb_enc_associate(str, enc);
5861 switch (
TYPE(indx)) {
5863 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5867 beg = rb_str_index(str, indx, 0);
/*
 * rb_str_aset_m(argc, argv, str) -- fragment; the argc dispatch is not
 * visible.  The visible lines show two outcomes: a call to
 * rb_str_subpat_set() with argv[0], argv[1] and argv[2], and a return of
 * rb_str_aset(str, argv[0], argv[1]).
 */
5921rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
5925 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5933 return rb_str_aset(str, argv[0], argv[1]);
/*
 * rb_str_slice_bang(argc, argv, str) -- fragment; many lines are elided.
 *
 * Visible behaviour:
 *  - str is prepared for modification with str_modify_keep_cr();
 *  - for a capture-group index nth, a negative-style index is adjusted by
 *    regs->num_regs and Qnil is returned when the adjusted value is <= 0,
 *    or when nth >= regs->num_regs; the removed length is
 *    END(nth) - beg;
 *  - there is a separate argc == 2 branch;
 *  - beg is several times derived as a byte offset (p - RSTRING_PTR(str));
 *  - for a string index, rb_str_index() locates it, Qnil is returned on
 *    -1, and len is the byte length of the index string;
 *  - the result receives encoding/coderange info via
 *    rb_enc_cr_str_copy_for_substr();
 *  - finally the receiver's length and terminator are rewritten.
 */
5993rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
6001 str_modify_keep_cr(str);
6009 if ((nth += regs->num_regs) <= 0)
return Qnil;
6011 else if (nth >= regs->num_regs)
return Qnil;
6013 len = END(nth) - beg;
6016 else if (argc == 2) {
6025 beg = p - RSTRING_PTR(str);
6029 beg = rb_str_index(str, indx, 0);
6030 if (beg == -1)
return Qnil;
6031 len = RSTRING_LEN(indx);
6043 beg = p - RSTRING_PTR(str);
6052 beg = p - RSTRING_PTR(str);
6056 rb_enc_cr_str_copy_for_substr(result, str);
6064 char *sptr = RSTRING_PTR(str);
6065 long slen = RSTRING_LEN(str);
6066 if (beg +
len > slen)
6070 slen - (beg +
len));
6072 STR_SET_LEN(str, slen);
6073 TERM_FILL(&sptr[slen], TERM_LEN(str));
6084 switch (OBJ_BUILTIN_TYPE(pat)) {
/*
 * get_pat_quoted(pat, check) -- fragment.
 * Dispatches on OBJ_BUILTIN_TYPE(pat) (the case labels are elided).
 * When check is non-zero and is_broken_string(pat) is true, the exception
 * object produced by rb_reg_check_preprocess(pat) is raised.
 */
6103get_pat_quoted(
VALUE pat,
int check)
6107 switch (OBJ_BUILTIN_TYPE(pat)) {
6121 if (check && is_broken_string(pat)) {
6122 rb_exc_raise(rb_reg_check_preprocess(pat));
/*
 * rb_pat_search(pat, str, pos, set_backref_str) -- fragment.
 *
 * Two search paths are visible; the condition choosing between them is
 * elided:
 *  - a byte search via rb_str_byteindex(str, pat, pos); when
 *    set_backref_str is non-zero, a frozen copy of str is recorded as the
 *    back-reference with rb_backref_set_string(), using the byte length
 *    of pat as the match length;
 *  - a regexp search via rb_reg_search0(), whose result is returned.
 */
6128rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6131 pos = rb_str_byteindex(str, pat, pos);
6132 if (set_backref_str) {
6134 str = rb_str_new_frozen_String(str);
6135 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6144 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
/*
 * rb_str_sub_bang(argc, argv, str) -- fragment; many lines are elided.
 *
 * Visible behaviour:
 *  - argv[1] is probed as a Hash with rb_check_hash_type();
 *  - the pattern comes from get_pat_quoted(argv[0], 1);
 *  - str_modifiable(str) is called before searching, and the search
 *    starts at offset 0 with back-reference recording enabled;
 *  - in one visible branch the match end is computed as
 *    beg0 + RSTRING_LEN(pat);
 *  - when a block or a hash supplies the replacement, the data pointer
 *    and length are captured first, the replacement is obtained from
 *    rb_yield(match0) or from the hash keyed by the matched substring,
 *    stringified with rb_obj_as_string(), and afterwards
 *    str_mod_check()/rb_check_frozen() are run on the receiver;
 *  - encodings are reconciled with rb_enc_compatible(); an (elided) error
 *    path reports both encoding names;
 *  - the splice itself resizes to len + rlen - plen, moves the tail,
 *    copies the replacement in, then rewrites length and terminator.
 */
6164rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6178 hash = rb_check_hash_type(argv[1]);
6184 pat = get_pat_quoted(argv[0], 1);
6186 str_modifiable(str);
6187 beg = rb_pat_search(pat, str, 0, 1);
6201 end0 = beg0 + RSTRING_LEN(pat);
6210 if (iter || !
NIL_P(hash)) {
/* snapshot pointer/length so a later str_mod_check() can compare */
6211 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6214 repl = rb_obj_as_string(
rb_yield(match0));
6217 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6218 repl = rb_obj_as_string(repl);
6220 str_mod_check(str, p,
len);
6221 rb_check_frozen(str);
6227 enc = rb_enc_compatible(str, repl);
6229 rb_encoding *str_enc = STR_ENC_GET(str);
6230 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6234 rb_enc_inspect_name(str_enc),
6235 rb_enc_inspect_name(STR_ENC_GET(repl)));
6237 enc = STR_ENC_GET(repl);
6240 rb_enc_associate(str, enc);
6250 rlen = RSTRING_LEN(repl);
6251 len = RSTRING_LEN(str);
6253 RESIZE_CAPA(str,
len + rlen - plen);
6255 p = RSTRING_PTR(str);
/* shift the bytes after the match to their new position */
6257 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6259 rp = RSTRING_PTR(repl);
6260 memmove(p + beg0, rp, rlen);
6262 STR_SET_LEN(str,
len);
6263 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6292 rb_str_sub_bang(argc, argv, str);
/*
 * str_gsub(argc, argv, str, bang) -- fragment; many lines are elided.
 *
 * Visible behaviour:
 *  - the replacement mode is one of STR, ITER or MAP (default STR);
 *    need_backref starts at -1, i.e. "not decided yet";
 *  - argv[1] is probed as a Hash; a wrong argument count raises through
 *    rb_error_arity(argc, 1, 2);
 *  - after the first search, Qnil is returned when bang is set (the
 *    guarding "no match" test is elided);
 *  - the output buffer estimate is the receiver's byte length + 30, and
 *    dest takes the receiver's encoding;
 *  - block/hash replacements are stringified, then str_mod_check()
 *    verifies the receiver still matches the saved pointer and length;
 *  - while need_backref is still negative it is set to (val != repl);
 *  - after each match the scan offset becomes end0 plus a length that, in
 *    one visible branch, is the byte length of the character at end0, and
 *    the loop stops once the offset passes the end of the receiver;
 *  - a final rb_pat_search(pat, str, last, 1) call is visible after the
 *    loop;
 *  - str_shared_replace(str, dest) is visible (the bang test around it
 *    is elided).
 */
6297str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6300 long beg, beg0, end0;
6301 long offset, blen, slen,
len, last;
6302 enum {STR, ITER, MAP} mode = STR;
6304 int need_backref = -1;
6305 rb_encoding *str_enc;
6314 hash = rb_check_hash_type(argv[1]);
6323 rb_error_arity(argc, 1, 2);
6326 pat = get_pat_quoted(argv[0], 1);
6327 beg = rb_pat_search(pat, str, 0, need_backref);
6329 if (bang)
return Qnil;
6334 blen = RSTRING_LEN(str) + 30;
6336 sp = RSTRING_PTR(str);
6337 slen = RSTRING_LEN(str);
6339 str_enc = STR_ENC_GET(str);
6340 rb_enc_associate(dest, str_enc);
6348 end0 = beg0 + RSTRING_LEN(pat);
6359 val = rb_obj_as_string(
rb_yield(match0));
6362 val = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6363 val = rb_obj_as_string(val);
6365 str_mod_check(str, sp, slen);
6370 else if (need_backref) {
6372 if (need_backref < 0) {
6373 need_backref = val != repl;
6380 len = beg0 - offset;
6394 if (RSTRING_LEN(str) <= end0)
break;
6395 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6397 offset = end0 +
len;
6399 cp = RSTRING_PTR(str) + offset;
6400 if (offset > RSTRING_LEN(str))
break;
6401 beg = rb_pat_search(pat, str, offset, need_backref);
6405 if (RSTRING_LEN(str) > offset) {
6408 rb_pat_search(pat, str, last, 1);
6410 str_shared_replace(str, dest);
/*
 * rb_str_gsub_bang(argc, argv, str): prepares str for modification with
 * str_modify_keep_cr() and then delegates to str_gsub() with bang = 1,
 * returning its result.
 */
6438rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6440 str_modify_keep_cr(str);
6441 return str_gsub(argc, argv, str, 1);
6464 return str_gsub(argc, argv, str, 0);
6482 str_modifiable(str);
6483 if (str == str2)
return str;
6487 return str_replace(str, str2);
/*
 * rb_str_clear(str) -- fragment.
 * Visible lines set the length to 0, store a zero byte at the start of
 * the data, and then branch on whether the string's encoding is
 * ASCII-compatible (both branch bodies are elided).
 */
6502rb_str_clear(
VALUE str)
6506 STR_SET_LEN(str, 0);
6507 RSTRING_PTR(str)[0] = 0;
6508 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6527rb_str_chr(
VALUE str)
6551 pos += RSTRING_LEN(str);
6552 if (pos < 0 || RSTRING_LEN(str) <= pos)
6555 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6574 long len = RSTRING_LEN(str);
6575 char *ptr, *head, *left = 0;
6579 if (pos < -
len ||
len <= pos)
6586 char byte = (char)(
NUM2INT(w) & 0xFF);
6588 if (!str_independent(str))
6589 str_make_independent(str);
6590 enc = STR_ENC_GET(str);
6591 head = RSTRING_PTR(str);
6593 if (!STR_EMBED_P(str)) {
6600 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6608 width = rb_enc_precise_mbclen(left, head+
len, enc);
6610 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
/*
 * str_byte_substr(str, beg, len, empty) -- fragment.
 *
 * Visible behaviour (n is the byte length of str):
 *  - returns Qnil when beg > n or len < 0;
 *  - returns Qnil when beg is negative at line 6633 (any adjustment of
 *    beg before that test is elided);
 *  - returns Qnil when empty is false, under a condition that is elided;
 *  - otherwise builds the result with str_subseq(str, beg, len), copies
 *    the encoding with str_enc_copy_direct(), and has a special case for
 *    an empty result whose encoding is not ASCII-compatible (body elided).
 */
6626str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6628 long n = RSTRING_LEN(str);
6630 if (beg > n ||
len < 0)
return Qnil;
6633 if (beg < 0)
return Qnil;
6638 if (!empty)
return Qnil;
6642 VALUE str2 = str_subseq(str, beg,
len);
6644 str_enc_copy_direct(str2, str);
6646 if (RSTRING_LEN(str2) == 0) {
6647 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6681 long beg,
len = RSTRING_LEN(str);
6689 return str_byte_substr(str, beg,
len, TRUE);
6694 return str_byte_substr(str, idx, 1, FALSE);
/*
 * rb_str_byteslice(argc, argv, str) -- fragment; the argc dispatch is
 * elided.  One visible path returns str_byte_substr(str, beg, len, TRUE);
 * the other returns str_byte_aref(str, argv[0]).
 */
6741rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6746 return str_byte_substr(str, beg,
len, TRUE);
6749 return str_byte_aref(str, argv[0]);
/*
 * str_check_beg_len(str, beg, len) -- fragment.
 *
 * beg and len are in/out pointers.  With slen the byte length of str, the
 * visible lines:
 *  - test whether *beg lies outside [-slen, slen] (the body of that test
 *    is elided -- presumably an error is raised; TODO confirm);
 *  - test whether *len exceeds slen - *beg (body elided -- presumably
 *    *len is clamped; TODO confirm);
 *  - call str_ensure_byte_pos() on both *beg and end (how end is computed
 *    is not visible).
 */
6753str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6755 long end, slen = RSTRING_LEN(str);
6758 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6767 if (*
len > slen - *beg) {
6771 str_ensure_byte_pos(str, *beg);
6772 str_ensure_byte_pos(str, end);
/*
 * rb_str_bytesplice(argc, argv, str) -- fragment; several lines elided.
 *
 * Visible behaviour:
 *  - argc must be 2, 3 or 5, otherwise ArgumentError is raised;
 *  - TypeError ("expected Range") is raised for argv[0] and for argv[2]
 *    under conditions that are elided;
 *  - vlen is set to the byte length of val in two visible places;
 *  - both (beg, len) against str and (vbeg, vlen) against val are
 *    validated with str_check_beg_len();
 *  - str is prepared for modification, its encoding is set to the result
 *    of rb_enc_check(str, val), and the bytes are replaced with
 *    rb_str_update_1().
 */
6797rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6799 long beg,
len, vbeg, vlen;
6804 if (!(argc == 2 || argc == 3 || argc == 5)) {
6805 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6809 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6810 rb_builtin_class_name(argv[0]));
6817 vlen = RSTRING_LEN(val);
6822 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6823 rb_builtin_class_name(argv[2]));
6835 vlen = RSTRING_LEN(val);
/* validate both byte ranges before touching the receiver */
6843 str_check_beg_len(str, &beg, &
len);
6844 str_check_beg_len(val, &vbeg, &vlen);
6845 str_modify_keep_cr(str);
6848 rb_enc_associate(str, rb_enc_check(str, val));
6851 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
/*
 * rb_str_reverse(str) -- fragment.
 *
 * Visible behaviour:
 *  - a string of at most one byte is returned as
 *    str_duplicate(rb_cString, str);
 *  - for longer strings there is a single-byte-optimizable path (body
 *    elided) and paths that step by character using rb_enc_fast_mbclen()
 *    or rb_enc_mbclen();
 *  - the result rev gets the receiver's byte length and its encoding via
 *    str_enc_copy_direct().
 */
6869rb_str_reverse(
VALUE str)
6876 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
6877 enc = STR_ENC_GET(str);
6883 if (RSTRING_LEN(str) > 1) {
6884 if (single_byte_optimizable(str)) {
6891 int clen = rb_enc_fast_mbclen(s, e, enc);
6899 cr = rb_enc_asciicompat(enc) ?
6902 int clen = rb_enc_mbclen(s, e, enc);
6911 STR_SET_LEN(rev, RSTRING_LEN(str));
6912 str_enc_copy_direct(rev, str);
/*
 * rb_str_reverse_bang(str) -- fragment.
 *
 * For strings longer than one byte: the single-byte-optimizable path
 * prepares the string for modification and works on its data pointer (the
 * swap loop is elided); another visible path replaces the receiver's
 * contents with rb_str_reverse(str) via str_shared_replace().  A further
 * str_modify_keep_cr(str) call is visible at line 6952; its enclosing
 * branch is not shown.
 */
6932rb_str_reverse_bang(
VALUE str)
6934 if (RSTRING_LEN(str) > 1) {
6935 if (single_byte_optimizable(str)) {
6938 str_modify_keep_cr(str);
6939 s = RSTRING_PTR(str);
6948 str_shared_replace(str, rb_str_reverse(str));
6952 str_modify_keep_cr(str);
6977 i = rb_str_index(str, arg, 0);
6979 return RBOOL(i != -1);
7021 rb_raise(rb_eArgError,
"invalid radix %d", base);
7023 return rb_str_to_inum(str, base, FALSE);
7047rb_str_to_f(
VALUE str)
7062rb_str_to_s(
VALUE str)
/*
 * str_cat_char(str, c, enc) -- fragment.
 * Encodes code point c in encoding enc into a stack buffer of
 * RUBY_MAX_CHAR_LEN bytes; n holds rb_enc_codelen(c, enc).  The line that
 * appends the encoded bytes to str is not visible.
 */
7072str_cat_char(
VALUE str,
unsigned int c, rb_encoding *enc)
7074 char s[RUBY_MAX_CHAR_LEN];
7075 int n = rb_enc_codelen(c, enc);
7077 rb_enc_mbcput(c, s, enc);
7082#define CHAR_ESC_LEN 13
/*
 * rb_str_buf_cat_escaped_char(result, c, unicode_p) -- fragment.
 *
 * Formats code point c into a local buffer of CHAR_ESC_LEN + 1 bytes
 * using one of five visible formats: "%c", "\u%04X" (taken when
 * c < 0x10000), "\u{%X}", "\x%02X" or "\x{%X}".  Most of the selecting
 * conditions are elided.  Every snprintf() is bounded by CHAR_ESC_LEN;
 * the formatted length is then taken with strlen().
 */
7085rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7087 char buf[CHAR_ESC_LEN + 1];
7095 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7097 else if (c < 0x10000) {
7098 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7101 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7106 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7109 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7112 l = (int)strlen(buf);
/*
 * ruby_escaped_char(c) -- fragment (the switch header and the fall-back
 * for unlisted values are elided).
 *
 * Maps a control character to the text of its backslash escape:
 * NUL, LF, CR, TAB, FF, VT (013), BS (010), BEL (007), ESC (033) and
 * DEL (0x7f, rendered as "\c?").
 */
7118ruby_escaped_char(
int c)
7121 case '\0':
return "\\0";
7122 case '\n':
return "\\n";
7123 case '\r':
return "\\r";
7124 case '\t':
return "\\t";
7125 case '\f':
return "\\f";
7126 case '\013':
return "\\v";
7127 case '\010':
return "\\b";
7128 case '\007':
return "\\a";
7129 case '\033':
return "\\e";
7130 case '\x7f':
return "\\c?";
/*
 * rb_str_escape(str) -- fragment; the loop header and several branches
 * are elided.
 *
 * Visible behaviour: walks the bytes of str character by character using
 * rb_enc_precise_mbclen().  prev marks the start of the pending run of
 * bytes that can be copied verbatim; the run is flushed to result before
 * any escape is emitted.
 *  - on an invalid sequence, n is set to the encoding's minimum character
 *    length (or to the remaining byte count in an elided branch) and
 *    bytes are emitted as "\xHH";
 *  - a character with a dedicated escape from ruby_escaped_char() is
 *    emitted as that escape;
 *  - there is a branch for printable ASCII characters in an
 *    ASCII-compatible encoding (body elided);
 *  - other characters go through rb_str_buf_cat_escaped_char();
 *  - after the loop any remaining verbatim run is flushed.
 */
7136rb_str_escape(
VALUE str)
7139 rb_encoding *enc = rb_enc_from_index(encidx);
7140 const char *p = RSTRING_PTR(str);
7142 const char *prev = p;
7143 char buf[CHAR_ESC_LEN + 1];
7145 int unicode_p = rb_enc_unicode_p(enc);
7146 int asciicompat = rb_enc_asciicompat(enc);
7151 int n = rb_enc_precise_mbclen(p, pend, enc);
7153 if (p > prev) str_buf_cat(result, prev, p - prev);
7154 n = rb_enc_mbminlen(enc);
7156 n = (int)(pend - p);
7158 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7159 str_buf_cat(result, buf, strlen(buf));
7165 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7167 cc = ruby_escaped_char(c);
7169 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7170 str_buf_cat(result, cc, strlen(cc));
7173 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7176 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7177 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7181 if (p > prev) str_buf_cat(result, prev, p - prev);
7204 rb_encoding *enc = rb_enc_from_index(encidx);
7205 const char *p, *pend, *prev;
7206 char buf[CHAR_ESC_LEN + 1];
7208 rb_encoding *resenc = rb_default_internal_encoding();
7209 int unicode_p = rb_enc_unicode_p(enc);
7210 int asciicompat = rb_enc_asciicompat(enc);
7212 if (resenc == NULL) resenc = rb_default_external_encoding();
7213 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7214 rb_enc_associate(result, resenc);
7215 str_buf_cat2(result,
"\"");
7223 n = rb_enc_precise_mbclen(p, pend, enc);
7225 if (p > prev) str_buf_cat(result, prev, p - prev);
7226 n = rb_enc_mbminlen(enc);
7228 n = (int)(pend - p);
7230 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7231 str_buf_cat(result, buf, strlen(buf));
7237 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7239 if ((asciicompat || unicode_p) &&
7240 (c ==
'"'|| c ==
'\\' ||
7245 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7246 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7247 str_buf_cat2(result,
"\\");
7248 if (asciicompat || enc == resenc) {
7254 case '\n': cc =
'n';
break;
7255 case '\r': cc =
'r';
break;
7256 case '\t': cc =
't';
break;
7257 case '\f': cc =
'f';
break;
7258 case '\013': cc =
'v';
break;
7259 case '\010': cc =
'b';
break;
7260 case '\007': cc =
'a';
break;
7261 case 033: cc =
'e';
break;
7262 default: cc = 0;
break;
7265 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7268 str_buf_cat(result, buf, 2);
7281 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7285 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7286 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7291 if (p > prev) str_buf_cat(result, prev, p - prev);
7292 str_buf_cat2(result,
"\"");
7297#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7317 int encidx = rb_enc_get_index(str);
7318 rb_encoding *enc = rb_enc_from_index(encidx);
7320 const char *p, *pend;
7323 int u8 = (encidx == rb_utf8_encindex());
7324 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7327 if (!rb_enc_asciicompat(enc)) {
7329 len += strlen(enc->name);
7332 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7335 unsigned char c = *p++;
7338 case '"':
case '\\':
7339 case '\n':
case '\r':
7340 case '\t':
case '\f':
7341 case '\013':
case '\010':
case '\007':
case '\033':
7346 clen = IS_EVSTR(p, pend) ? 2 : 1;
7354 if (u8 && c > 0x7F) {
7355 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7357 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7360 else if (cc <= 0xFFFFF)
7373 if (clen > LONG_MAX -
len) {
7380 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7381 q = RSTRING_PTR(result); qend = q +
len + 1;
7385 unsigned char c = *p++;
7387 if (c ==
'"' || c ==
'\\') {
7391 else if (c ==
'#') {
7392 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7395 else if (c ==
'\n') {
7399 else if (c ==
'\r') {
7403 else if (c ==
'\t') {
7407 else if (c ==
'\f') {
7411 else if (c ==
'\013') {
7415 else if (c ==
'\010') {
7419 else if (c ==
'\007') {
7423 else if (c ==
'\033') {
7433 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7435 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7438 snprintf(q, qend-q,
"u%04X", cc);
7440 snprintf(q, qend-q,
"u{%X}", cc);
7445 snprintf(q, qend-q,
"x%02X", c);
7451 if (!rb_enc_asciicompat(enc)) {
7452 snprintf(q, qend-q, nonascii_suffix, enc->name);
7453 encidx = rb_ascii8bit_encindex();
7456 rb_enc_associate_index(result, encidx);
7462unescape_ascii(
unsigned int c)
/*
 * undump_after_backslash(undumped, ss, s_end, penc, utf8, binary) --
 * fragment; the switch over the escape character is elided.
 *
 * Visible behaviour:
 *  - *ss is the current read position; decoded bytes are staged in a
 *    6-byte local buffer;
 *  - simple escapes are decoded through unescape_ascii();
 *  - for the Unicode escape forms the UTF-8 encoding is looked up once
 *    and cached in a function-local static; if *penc differs,
 *    undumped is associated with UTF-8 (updating of *penc is not
 *    visible);
 *  - a braced hex escape must have 1 to 6 hex digits;
 *  - code points in the surrogate range 0xD800..0xDFFF are singled out in
 *    both Unicode forms (the handling is elided -- presumably an error;
 *    TODO confirm);
 *  - the code point is encoded with rb_enc_mbcput() into buf.
 */
7486undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end, rb_encoding **penc,
bool *utf8,
bool *binary)
7488 const char *s = *ss;
7492 unsigned char buf[6];
7493 static rb_encoding *enc_utf8 = NULL;
7510 *buf = unescape_ascii(*s);
7522 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7523 if (*penc != enc_utf8) {
7525 rb_enc_associate(undumped, enc_utf8);
7542 if (hexlen == 0 || hexlen > 6) {
7548 if (0xd800 <= c && c <= 0xdfff) {
7551 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7561 if (0xd800 <= c && c <= 0xdfff) {
7564 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7592static VALUE rb_str_is_ascii_only_p(
VALUE str);
/*
 * str_undump(str) -- fragment; the main scanning loop is largely elided.
 *
 * Visible behaviour:
 *  - the result starts as an empty string in str's encoding;
 *  - there are early checks on rb_str_is_ascii_only_p(str) and
 *    str_null_check(str, &w) (their failure handling is elided);
 *  - input shorter than 2 bytes, or not starting with '"', jumps to
 *    invalid_format;
 *  - after the closing quote an optional ".dup" suffix is skipped, then
 *    ".force_encoding(\"" is required, followed by an encoding name that
 *    must be terminated by exactly "\")" at the end of the input;
 *  - a RuntimeError is raised when a Unicode escape was seen together
 *    with force_encoding (the guarding test is elided);
 *  - the named encoding is resolved with rb_enc_find_index2() and applied
 *    to the result;
 *  - backslash escapes are delegated to undump_after_backslash();
 *  - a RuntimeError describing the accepted forms is raised at the end
 *    (the invalid_format label itself is not visible).
 */
7610str_undump(
VALUE str)
7612 const char *s = RSTRING_PTR(str);
7614 rb_encoding *enc = rb_enc_get(str);
7615 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7617 bool binary =
false;
7621 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7624 if (!str_null_check(str, &w)) {
7627 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7628 if (*s !=
'"')
goto invalid_format;
7646 static const char force_encoding_suffix[] =
".force_encoding(\"";
7647 static const char dup_suffix[] =
".dup";
7648 const char *encname;
/* sizeof - 1 drops the literal's trailing NUL */
7653 size =
sizeof(dup_suffix) - 1;
7654 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7656 size =
sizeof(force_encoding_suffix) - 1;
7657 if (s_end - s <= size)
goto invalid_format;
7658 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7662 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7666 s = memchr(s,
'"', s_end-s);
7668 if (!s)
goto invalid_format;
7669 if (s_end - s != 2)
goto invalid_format;
7670 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7672 encidx = rb_enc_find_index2(encname, (
long)size);
7676 rb_enc_associate_index(undumped, encidx);
7686 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7697 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
/*
 * rb_str_check_dummy_enc(enc) -- fragment.
 * Tests whether enc is a dummy encoding with rb_enc_dummy_p(); the action
 * taken for a dummy encoding is elided (presumably an error is raised --
 * TODO confirm).
 */
7701rb_str_check_dummy_enc(rb_encoding *enc)
7703 if (rb_enc_dummy_p(enc)) {
/*
 * str_true_enc(str) -- fragment.
 * Fetches the string's encoding with STR_ENC_GET() and passes it through
 * rb_str_check_dummy_enc().  The return statement is elided; callers in
 * this file assign the result to an rb_encoding pointer.
 */
7710str_true_enc(
VALUE str)
7712 rb_encoding *enc = STR_ENC_GET(str);
7713 rb_str_check_dummy_enc(enc);
/*
 * check_case_options(argc, argv, flags) -- fragment; the argc tests and
 * the final return are elided.
 *
 * Folds the option symbols in argv into the case-mapping flag set:
 *  - sym_turkic sets ONIGENC_CASE_FOLD_TURKISH_AZERI and may be followed
 *    by sym_lithuanian (sets ONIGENC_CASE_FOLD_LITHUANIAN);
 *  - sym_lithuanian may likewise be followed by sym_turkic;
 *  - any other second option raises ArgumentError
 *    ("invalid second option");
 *  - sym_ascii sets ONIGENC_CASE_ASCII_ONLY;
 *  - sym_fold is accepted only when flags requests downcasing without
 *    upcasing; the FOLD and DOWNCASE bits are then toggled with XOR
 *    (DOWNCASE is known to be set at that point, so it is cleared);
 *    otherwise ArgumentError is raised;
 *  - "too many options" and "invalid option" are raised under conditions
 *    that are elided.
 */
7717static OnigCaseFoldType
7718check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7723 rb_raise(rb_eArgError,
"too many options");
7724 if (argv[0]==sym_turkic) {
7725 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7727 if (argv[1]==sym_lithuanian)
7728 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7730 rb_raise(rb_eArgError,
"invalid second option");
7733 else if (argv[0]==sym_lithuanian) {
7734 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7736 if (argv[1]==sym_turkic)
7737 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7739 rb_raise(rb_eArgError,
"invalid second option");
7743 rb_raise(rb_eArgError,
"too many options");
7744 else if (argv[0]==sym_ascii)
7745 flags |= ONIGENC_CASE_ASCII_ONLY;
7746 else if (argv[0]==sym_fold) {
7747 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7748 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7750 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7753 rb_raise(rb_eArgError,
"invalid option");
/*
 * case_option_single_p(flags, enc, str) -- fragment; only the first test
 * is visible.  That test holds when ASCII-only mapping was requested and
 * enc is either UTF-8 or an encoding whose maximum character length is
 * one byte.  What is returned for it, and any further tests involving
 * str, are elided.
 */
7758case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc,
VALUE str)
7760 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
7766#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7767#ifndef CASEMAP_DEBUG
7768# define CASEMAP_DEBUG 0
7776 OnigUChar space[FLEX_ARY_LEN];
/*
 * mapping_buffer_free(p) -- fragment; the local declarations and the
 * initialisation of current_buffer are elided (presumably from p -- TODO
 * confirm).
 *
 * Walks the singly linked list through ->next and releases every node
 * with ruby_sized_xfree(), passing the node's capa field as the size.
 * The next pointer is read before the node is freed.
 */
7780mapping_buffer_free(
void *p)
7784 while (current_buffer) {
7785 previous_buffer = current_buffer;
7786 current_buffer = current_buffer->next;
7787 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7793 {0, mapping_buffer_free,},
7794 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
/*
 * rb_str_casemap(source, flags, enc) -- fragment; several lines
 * (allocation of each buffer, closing braces, the return) are elided.
 *
 * Case-maps source through the encoding's case_map callback into a chain
 * of temporary buffers and then assembles the result string.
 *
 * Visible behaviour:
 *  - an empty source is returned as str_duplicate(rb_cString, source);
 *  - each loop round sizes a new buffer as
 *    (remaining source bytes) * (round number) +
 *    CASE_MAPPING_ADDITIONAL_LENGTH, links it at the tail of the chain
 *    through pre_buffer, and lets enc->case_map() consume input into it;
 *  - a negative result from case_map frees the chain reachable from
 *    buffer_anchor and raises ArgumentError ("input string invalid");
 *  - the number of bytes produced is stored in the buffer's used field
 *    and added to target_length;
 *  - with exactly one buffer the result is created directly from it;
 *    otherwise every buffer's used bytes are copied in order into
 *    target, after which the chain is freed;
 *  - the result takes source's encoding via str_enc_copy_direct().
 */
7798rb_str_casemap(
VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7802 const OnigUChar *source_current, *source_end;
7803 int target_length = 0;
7804 VALUE buffer_anchor;
7807 size_t buffer_count = 0;
7808 int buffer_length_or_invalid;
7810 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7812 source_current = (OnigUChar*)RSTRING_PTR(source);
7817 while (source_current < source_end) {
/* buffer_count is incremented here, so later buffers are sized with a larger multiplier */
7819 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7820 if (CASEMAP_DEBUG) {
7821 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
/* append the new buffer to the chain and advance the tail link */
7824 *pre_buffer = current_buffer;
7825 pre_buffer = &current_buffer->next;
7826 current_buffer->next = NULL;
7827 current_buffer->capa =
capa;
7828 buffer_length_or_invalid = enc->case_map(flags,
7829 &source_current, source_end,
7830 current_buffer->space,
7831 current_buffer->space+current_buffer->capa,
7833 if (buffer_length_or_invalid < 0) {
/* release the whole chain before raising */
7834 current_buffer =
DATA_PTR(buffer_anchor);
7836 mapping_buffer_free(current_buffer);
7837 rb_raise(rb_eArgError,
"input string invalid");
7839 target_length += current_buffer->used = buffer_length_or_invalid;
7841 if (CASEMAP_DEBUG) {
7842 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7845 if (buffer_count==1) {
7846 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7849 char *target_current;
7852 target_current = RSTRING_PTR(target);
7853 current_buffer =
DATA_PTR(buffer_anchor);
/* concatenate the used part of every buffer, in chain order */
7854 while (current_buffer) {
7855 memcpy(target_current, current_buffer->space, current_buffer->used);
7856 target_current += current_buffer->used;
7857 current_buffer = current_buffer->next;
7860 current_buffer =
DATA_PTR(buffer_anchor);
7862 mapping_buffer_free(current_buffer);
7867 str_enc_copy_direct(target, source);
/*
 * rb_str_ascii_casemap(source, target, flags, enc) -- fragment.
 *
 * ASCII-only case mapping from source into target through
 * onigenc_ascii_only_case_map().
 *
 * Visible behaviour:
 *  - returns Qnil immediately for an empty source;
 *  - when source and target are the same object the mapping is done in
 *    place (target pointers alias the source pointers); otherwise the
 *    output pointer is target's data (target_end setup is elided);
 *  - a negative result raises ArgumentError ("input string invalid");
 *  - when CASEMAP_DEBUG is enabled and the produced length differs from
 *    the original length, the mismatch is printed and raised;
 *  - target takes source's encoding via str_enc_copy().
 */
7874rb_str_ascii_casemap(
VALUE source,
VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7876 const OnigUChar *source_current, *source_end;
7877 OnigUChar *target_current, *target_end;
7878 long old_length = RSTRING_LEN(source);
7879 int length_or_invalid;
7881 if (old_length == 0)
return Qnil;
7883 source_current = (OnigUChar*)RSTRING_PTR(source);
7885 if (source == target) {
7886 target_current = (OnigUChar*)source_current;
7887 target_end = (OnigUChar*)source_end;
7890 target_current = (OnigUChar*)RSTRING_PTR(target);
7894 length_or_invalid = onigenc_ascii_only_case_map(flags,
7895 &source_current, source_end,
7896 target_current, target_end, enc);
7897 if (length_or_invalid < 0)
7898 rb_raise(rb_eArgError,
"input string invalid");
7899 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7900 fprintf(stderr,
"problem with rb_str_ascii_casemap"
7901 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7902 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
7903 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7906 str_enc_copy(target, source);
/*
 * upcase_single(str) -- fragment; the loop header, the update of the
 * modified flag and the return are elided.
 *
 * Operates on the raw bytes between RSTRING_PTR and RSTRING_END: a byte
 * in 'a'..'z' is overwritten in place with the corresponding byte in
 * 'A'..'Z'.  Callers in this file test the return value as a
 * "something changed" indicator.
 */
7912upcase_single(
VALUE str)
7914 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
7915 bool modified =
false;
7918 unsigned int c = *(
unsigned char*)s;
7920 if (
'a' <= c && c <=
'z') {
7921 *s =
'A' + (c -
'a');
/*
 * rb_str_upcase_bang(argc, argv, str) -- fragment.
 *
 * Starts from ONIGENC_CASE_UPCASE, merges the caller's options with
 * check_case_options(), prepares str for modification and resolves its
 * encoding with str_true_enc().  Three strategies are visible:
 *  - byte-wise upcase_single() when case_option_single_p() holds; a true
 *    result sets ONIGENC_CASE_MODIFIED;
 *  - in-place rb_str_ascii_casemap() when ASCII-only mapping was asked;
 *  - replacing the contents with the result of rb_str_casemap().
 * str is returned when ONIGENC_CASE_MODIFIED is set; the other return is
 * elided.
 */
7949rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
7952 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7954 flags = check_case_options(argc, argv, flags);
7955 str_modify_keep_cr(str);
7956 enc = str_true_enc(str);
7957 if (case_option_single_p(flags, enc, str)) {
7958 if (upcase_single(str))
7959 flags |= ONIGENC_CASE_MODIFIED;
7961 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7962 rb_str_ascii_casemap(str, str, &flags, enc);
7964 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7966 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_upcase(argc, argv, str) -- fragment.
 *
 * Non-destructive counterpart of the upcase mapping: options are merged
 * into ONIGENC_CASE_UPCASE and the encoding resolved with
 * str_true_enc().  Visible strategies:
 *  - when case_option_single_p() holds, ret is a fresh byte copy of str
 *    carrying str's encoding (the mapping call on ret is elided);
 *  - for ASCII-only mapping, rb_str_ascii_casemap() writes from str into
 *    ret (creation of ret is elided);
 *  - otherwise ret is the result of rb_str_casemap().
 */
7988rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
7991 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7994 flags = check_case_options(argc, argv, flags);
7995 enc = str_true_enc(str);
7996 if (case_option_single_p(flags, enc, str)) {
7997 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7998 str_enc_copy_direct(ret, str);
8001 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8003 rb_str_ascii_casemap(str, ret, &flags, enc);
8006 ret = rb_str_casemap(str, &flags, enc);
/*
 * downcase_single(str) -- fragment; the loop header, the update of the
 * modified flag and the return are elided.
 *
 * Operates on the raw bytes between RSTRING_PTR and RSTRING_END: a byte
 * in 'A'..'Z' is overwritten in place with the corresponding byte in
 * 'a'..'z'.  Callers in this file test the return value as a
 * "something changed" indicator.
 */
8013downcase_single(
VALUE str)
8015 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8016 bool modified =
false;
8019 unsigned int c = *(
unsigned char*)s;
8021 if (
'A' <= c && c <=
'Z') {
8022 *s =
'a' + (c -
'A');
/*
 * rb_str_downcase_bang(argc, argv, str) -- fragment.
 *
 * Starts from ONIGENC_CASE_DOWNCASE, merges the caller's options with
 * check_case_options(), prepares str for modification and resolves its
 * encoding with str_true_enc().  Three strategies are visible:
 *  - byte-wise downcase_single() when case_option_single_p() holds; a
 *    true result sets ONIGENC_CASE_MODIFIED;
 *  - in-place rb_str_ascii_casemap() when ASCII-only mapping was asked;
 *  - replacing the contents with the result of rb_str_casemap().
 * str is returned when ONIGENC_CASE_MODIFIED is set; the other return is
 * elided.
 */
8051rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8054 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8056 flags = check_case_options(argc, argv, flags);
8057 str_modify_keep_cr(str);
8058 enc = str_true_enc(str);
8059 if (case_option_single_p(flags, enc, str)) {
8060 if (downcase_single(str))
8061 flags |= ONIGENC_CASE_MODIFIED;
8063 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8064 rb_str_ascii_casemap(str, str, &flags, enc);
8066 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8068 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_downcase(argc, argv, str) -- fragment.
 *
 * Non-destructive counterpart of the downcase mapping: options are merged
 * into ONIGENC_CASE_DOWNCASE and the encoding resolved with
 * str_true_enc().  Visible strategies:
 *  - when case_option_single_p() holds, ret is a fresh byte copy of str
 *    carrying str's encoding, mapped in place with downcase_single();
 *  - for ASCII-only mapping, rb_str_ascii_casemap() writes from str into
 *    ret (creation of ret is elided);
 *  - otherwise ret is the result of rb_str_casemap().
 */
8090rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8093 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8096 flags = check_case_options(argc, argv, flags);
8097 enc = str_true_enc(str);
8098 if (case_option_single_p(flags, enc, str)) {
8099 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8100 str_enc_copy_direct(ret, str);
8101 downcase_single(ret);
8103 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8105 rb_str_ascii_casemap(str, ret, &flags, enc);
8108 ret = rb_str_casemap(str, &flags, enc);
/*
 * rb_str_capitalize_bang(argc, argv, str) -- fragment.
 *
 * Starts from ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE, merges the
 * caller's options, prepares str for modification and resolves its
 * encoding.  Returns Qnil for an empty string (or a null data pointer).
 * ASCII-only mapping is done in place with rb_str_ascii_casemap();
 * otherwise the contents are replaced by the result of rb_str_casemap().
 * str is returned when ONIGENC_CASE_MODIFIED is set; the other return is
 * elided.
 */
8136rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8139 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8141 flags = check_case_options(argc, argv, flags);
8142 str_modify_keep_cr(str);
8143 enc = str_true_enc(str);
8144 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8145 if (flags&ONIGENC_CASE_ASCII_ONLY)
8146 rb_str_ascii_casemap(str, str, &flags, enc);
8148 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8150 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_capitalize(argc, argv, str) -- fragment.
 *
 * Non-destructive capitalisation with flags
 * ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE plus caller options.
 * For an empty string (or a null data pointer) the visible code returns
 * str itself rather than a duplicate.
 * NOTE(review): the sibling rb_str_swapcase returns
 * str_duplicate(rb_cString, str) in the same situation -- confirm that
 * returning the receiver here is intended.
 * Otherwise ret comes from rb_str_ascii_casemap() (ASCII-only; creation
 * of ret elided) or from rb_str_casemap().
 */
8174rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8177 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8180 flags = check_case_options(argc, argv, flags);
8181 enc = str_true_enc(str);
8182 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
8183 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8185 rb_str_ascii_casemap(str, ret, &flags, enc);
8188 ret = rb_str_casemap(str, &flags, enc);
/*
 * rb_str_swapcase_bang(argc, argv, str) -- fragment.
 *
 * Starts from ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE, merges the
 * caller's options, prepares str for modification and resolves its
 * encoding.  ASCII-only mapping is done in place with
 * rb_str_ascii_casemap(); otherwise the contents are replaced by the
 * result of rb_str_casemap().  str is returned when
 * ONIGENC_CASE_MODIFIED is set; the other return is elided.
 */
8215rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8218 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8220 flags = check_case_options(argc, argv, flags);
8221 str_modify_keep_cr(str);
8222 enc = str_true_enc(str);
8223 if (flags&ONIGENC_CASE_ASCII_ONLY)
8224 rb_str_ascii_casemap(str, str, &flags, enc);
8226 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8228 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_swapcase(argc, argv, str) -- fragment.
 *
 * Non-destructive case swap with flags
 * ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE plus caller options.
 * An empty string (or a null data pointer) yields
 * str_duplicate(rb_cString, str).  Otherwise ret comes from
 * rb_str_ascii_casemap() (ASCII-only; creation of ret elided) or from
 * rb_str_casemap().
 */
8252rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8255 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8258 flags = check_case_options(argc, argv, flags);
8259 enc = str_true_enc(str);
8260 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8261 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8263 rb_str_ascii_casemap(str, ret, &flags, enc);
8266 ret = rb_str_casemap(str, &flags, enc);
8271typedef unsigned char *USTR;
8275 unsigned int now, max;
/*
 * trnext(t, enc) -- fragment; the enclosing loop and several branches
 * are elided.
 *
 * Produces the next code point described by a tr-style character
 * specification held in *t (cursor p, end pend, current value now, range
 * upper bound max).
 *
 * Visible behaviour:
 *  - returns -1 when the cursor has reached pend;
 *  - a backslash that is not the last character is recognised as an
 *    escape (its handling is elided);
 *  - the current character is decoded into t->now;
 *  - a following '-' that is not the last character starts a range; the
 *    range's upper bound c is decoded, and a bad range raises
 *    ArgumentError, with the two characters shown when both are below
 *    0x80 (the test that detects a bad range is elided);
 *  - when t->now < c a range expansion begins (body elided);
 *  - while expanding, code points for which ONIGENC_CODE_TO_MBCLEN()
 *    is <= 0 are skipped, stopping at t->max.
 */
8280trnext(
struct tr *t, rb_encoding *enc)
8287 if (t->p == t->pend)
return -1;
8288 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8291 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8293 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8295 if (t->p < t->pend) {
8296 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8299 if (t->now < 0x80 && c < 0x80) {
8300 rb_raise(rb_eArgError,
8301 "invalid range \"%c-%c\" in string transliteration",
8305 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8309 else if (t->now < c) {
8318 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8319 if (t->now == t->max) {
8324 if (t->now < t->max) {
8340 const unsigned int errc = -1;
8341 unsigned int trans[256];
8342 rb_encoding *enc, *e1, *e2;
8343 struct tr trsrc, trrepl;
8345 unsigned int c, c0, last = 0;
8346 int modify = 0, i, l;
8347 unsigned char *s, *send;
8349 int singlebyte = single_byte_optimizable(str);
8353#define CHECK_IF_ASCII(c) \
8354 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8355 (cr = ENC_CODERANGE_VALID) : 0)
8359 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8360 if (RSTRING_LEN(repl) == 0) {
8361 return rb_str_delete_bang(1, &src, str);
8365 e1 = rb_enc_check(str, src);
8366 e2 = rb_enc_check(str, repl);
8371 enc = rb_enc_check(src, repl);
8373 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8374 if (RSTRING_LEN(src) > 1 &&
8375 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8376 trsrc.p + l < trsrc.pend) {
8380 trrepl.p = RSTRING_PTR(repl);
8381 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8382 trsrc.gen = trrepl.gen = 0;
8383 trsrc.now = trrepl.now = 0;
8384 trsrc.max = trrepl.max = 0;
8387 for (i=0; i<256; i++) {
8390 while ((c = trnext(&trsrc, enc)) != errc) {
8395 if (!hash) hash = rb_hash_new();
8399 while ((c = trnext(&trrepl, enc)) != errc)
8402 for (i=0; i<256; i++) {
8403 if (trans[i] != errc) {
8411 for (i=0; i<256; i++) {
8414 while ((c = trnext(&trsrc, enc)) != errc) {
8415 r = trnext(&trrepl, enc);
8416 if (r == errc) r = trrepl.now;
8419 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8422 if (!hash) hash = rb_hash_new();
8430 str_modify_keep_cr(str);
8431 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8432 termlen = rb_enc_mbminlen(enc);
8435 long offset, max = RSTRING_LEN(str);
8436 unsigned int save = -1;
8437 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8442 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8445 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8448 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8450 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8459 if (cflag) c = last;
8462 else if (cflag) c = errc;
8468 if (c != (
unsigned int)-1) {
8474 tlen = rb_enc_codelen(c, enc);
8480 if (enc != e1) may_modify = 1;
8482 if ((offset = t - buf) + tlen > max) {
8483 size_t MAYBE_UNUSED(old) = max + termlen;
8484 max = offset + tlen + (send - s);
8485 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8488 rb_enc_mbcput(c, t, enc);
8489 if (may_modify && memcmp(s, t, tlen) != 0) {
8495 if (!STR_EMBED_P(str)) {
8496 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8498 TERM_FILL((
char *)t, termlen);
8499 RSTRING(str)->as.heap.ptr = (
char *)buf;
8500 STR_SET_LEN(str, t - buf);
8501 STR_SET_NOEMBED(str);
8502 RSTRING(str)->as.heap.aux.capa = max;
8506 c = (
unsigned char)*s;
8507 if (trans[c] != errc) {
8524 long offset, max = (long)((send - s) * 1.2);
8525 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8530 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8533 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8536 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8538 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8546 if (cflag) c = last;
8549 else if (cflag) c = errc;
8553 c = cflag ? last : errc;
8556 tlen = rb_enc_codelen(c, enc);
8561 if (enc != e1) may_modify = 1;
8563 if ((offset = t - buf) + tlen > max) {
8564 size_t MAYBE_UNUSED(old) = max + termlen;
8565 max = offset + tlen + (long)((send - s) * 1.2);
8566 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8570 rb_enc_mbcput(c, t, enc);
8571 if (may_modify && memcmp(s, t, tlen) != 0) {
8579 if (!STR_EMBED_P(str)) {
8580 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8582 TERM_FILL((
char *)t, termlen);
8583 RSTRING(str)->as.heap.ptr = (
char *)buf;
8584 STR_SET_LEN(str, t - buf);
8585 STR_SET_NOEMBED(str);
8586 RSTRING(str)->as.heap.aux.capa = max;
8592 rb_enc_associate(str, enc);
8611 return tr_trans(str, src, repl, 0);
8658 tr_trans(str, src, repl, 0);
8662#define TR_TABLE_MAX (UCHAR_MAX+1)
8663#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * tr_setup_table(str, stable, first, tablep, ctablep, enc) -- fragment;
 * several lines are elided.
 *
 * Builds, or intersects into, a character-membership table from the
 * tr-style specification str.
 *
 * Visible behaviour:
 *  - stable has TR_TABLE_SIZE entries: one per byte value below
 *    TR_TABLE_MAX plus a final slot at index TR_TABLE_MAX that stores the
 *    negation flag;
 *  - a leading '^' in a specification longer than one byte is detected
 *    (its effect on cflag is elided);
 *  - code points below TR_TABLE_MAX yielded by trnext() are recorded in a
 *    local buf as !cflag;
 *  - larger code points are recorded in a Hash that is created on demand
 *    or reused from ptable;
 *  - finally each stable[i] is AND-ed with buf[i], so successive calls
 *    narrow the selection.
 */
8665tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8666 VALUE *tablep,
VALUE *ctablep, rb_encoding *enc)
8668 const unsigned int errc = -1;
8669 char buf[TR_TABLE_MAX];
8672 VALUE table = 0, ptable = 0;
8673 int i, l, cflag = 0;
8675 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8676 tr.gen =
tr.now =
tr.max = 0;
8678 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8683 for (i=0; i<TR_TABLE_MAX; i++) {
8686 stable[TR_TABLE_MAX] = cflag;
8688 else if (stable[TR_TABLE_MAX] && !cflag) {
8689 stable[TR_TABLE_MAX] = 0;
8691 for (i=0; i<TR_TABLE_MAX; i++) {
8695 while ((c = trnext(&
tr, enc)) != errc) {
8696 if (c < TR_TABLE_MAX) {
8697 buf[(
unsigned char)c] = !cflag;
8702 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8705 table = ptable ? ptable : rb_hash_new();
8709 table = rb_hash_new();
8714 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8715 rb_hash_aset(table, key,
Qtrue);
/* intersect this specification with what earlier calls left in stable */
8719 for (i=0; i<TR_TABLE_MAX; i++) {
8720 stable[i] = stable[i] && buf[i];
8722 if (!table && !cflag) {
/*
 * tr_find(c, table, del, nodel) -- fragment; the results of the two hash
 * branches are elided.
 *
 * Membership test for code point c:
 *  - for c below TR_TABLE_MAX the answer is table[c] != 0;
 *  - otherwise the del and nodel hashes are consulted through
 *    rb_hash_lookup(): one branch covers "present in del and not present
 *    in nodel (or nodel unset)", another "present in nodel";
 *  - when neither branch decides, the result is the flag stored in
 *    table[TR_TABLE_MAX], normalised to TRUE/FALSE.
 */
8729tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8731 if (c < TR_TABLE_MAX) {
8732 return table[c] != 0;
8738 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8739 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8743 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8746 return table[TR_TABLE_MAX] ? TRUE : FALSE;
/*
 * rb_str_delete_bang(argc, argv, str) -- fragment; the scanning loop is
 * partly elided.
 *
 * Visible behaviour:
 *  - returns Qnil for an empty string (or a null data pointer);
 *  - every argument is encoding-checked against str and folded into the
 *    selection table with tr_setup_table() (first = 1 only for the first
 *    argument);
 *  - str is prepared for modification, then scanned with a read cursor s
 *    and a write cursor t that both start at the data pointer;
 *  - bytes below 0x80 in an ASCII-compatible encoding take a fast path
 *    (body elided); other characters are decoded with
 *    rb_enc_codepoint_len() and tested with tr_find();
 *  - a character is re-encoded at t only when t != s;
 *  - afterwards the terminator and length are rewritten from t;
 *  - str is returned when modify is set; the other return is elided.
 */
8760rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8762 char squeez[TR_TABLE_SIZE];
8763 rb_encoding *enc = 0;
8765 VALUE del = 0, nodel = 0;
8767 int i, ascompat, cr;
8769 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8771 for (i=0; i<argc; i++) {
8775 enc = rb_enc_check(str, s);
8776 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8779 str_modify_keep_cr(str);
8780 ascompat = rb_enc_asciicompat(enc);
8781 s = t = RSTRING_PTR(str);
8788 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8799 c = rb_enc_codepoint_len(s, send, &clen, enc);
8801 if (tr_find(c, squeez, del, nodel)) {
8805 if (t != s) rb_enc_mbcput(c, t, enc);
8812 TERM_FILL(t, TERM_LEN(str));
8813 STR_SET_LEN(str, t - RSTRING_PTR(str));
8816 if (modify)
return str;
/*
 * rb_str_delete(argc, argv, str) -- fragment.
 * The only visible statement delegates to rb_str_delete_bang() with the
 * same arguments.  The surrounding lines are elided; presumably str is
 * duplicated beforehand and the copy returned -- TODO confirm.
 */
8836rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8839 rb_str_delete_bang(argc, argv, str);
/*
 * rb_str_squeeze_bang(argc, argv, str) -- fragment; loop headers and
 * several bodies are elided.
 *
 * Visible behaviour:
 *  - the encoding defaults to str's own (under an elided condition);
 *    every argument is encoding-checked against str and folded into the
 *    selection table, and the single-byte fast path is given up when an
 *    argument is not single-byte-optimizable (the assignment itself is
 *    elided);
 *  - str is prepared for modification; Qnil is returned for an empty
 *    string (or a null data pointer);
 *  - a character is kept when it differs from the previously kept one
 *    (save), or when arguments were given and the character is not
 *    selected by them;
 *  - the same rule appears in three variants: raw bytes, ASCII bytes
 *    within a multibyte string, and decoded code points tested with
 *    tr_find();
 *  - afterwards the terminator is rewritten and the length is updated
 *    only if it changed;
 *  - str is returned when modify is set; the other return is elided.
 */
8853rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8855 char squeez[TR_TABLE_SIZE];
8856 rb_encoding *enc = 0;
8857 VALUE del = 0, nodel = 0;
8858 unsigned char *s, *send, *t;
8860 int ascompat, singlebyte = single_byte_optimizable(str);
8864 enc = STR_ENC_GET(str);
8867 for (i=0; i<argc; i++) {
8871 enc = rb_enc_check(str, s);
8872 if (singlebyte && !single_byte_optimizable(s))
8874 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8878 str_modify_keep_cr(str);
8879 s = t = (
unsigned char *)RSTRING_PTR(str);
8880 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8883 ascompat = rb_enc_asciicompat(enc);
8887 unsigned int c = *s++;
8888 if (c != save || (argc > 0 && !squeez[c])) {
8898 if (ascompat && (c = *s) < 0x80) {
8899 if (c != save || (argc > 0 && !squeez[c])) {
8905 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8907 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8908 if (t != s) rb_enc_mbcput(c, t, enc);
8917 TERM_FILL((
char *)t, TERM_LEN(str));
8918 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8919 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
8923 if (modify)
return str;
/*
 * Wrapper around rb_str_squeeze_bang().  Only the forwarding call is visible
 * in this excerpt; preparation of `str` and the return are not shown.
 */
8946rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
8949 rb_str_squeeze_bang(argc, argv, str);
8967 return tr_trans(str, src, repl, 1);
8990 tr_trans(str, src, repl, 1);
/*
 * Counts characters of `str` selected by the arguments (excerpt: only part
 * of the body is visible).  Two paths appear below: a byte-comparison fast
 * path for a one-byte selector, and a general path that builds a table with
 * tr_setup_table() and tests each character with tr_find().
 */
9019rb_str_count(
int argc,
VALUE *argv,
VALUE str)
9021 char table[TR_TABLE_SIZE];
9022 rb_encoding *enc = 0;
9023 VALUE del = 0, nodel = 0, tstr;
9033 enc = rb_enc_check(str, tstr);
/*
 * Fast path guard: selector is exactly one byte, the encoding is
 * ASCII-compatible, ONIGENC_IS_ALLOWED_REVERSE_MATCH holds for that byte,
 * and `str` is not a broken string.
 */
9036 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9037 (ptstr = RSTRING_PTR(tstr),
9038 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
9039 !is_broken_string(str)) {
9041 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9043 s = RSTRING_PTR(str);
/* Empty receiver counts as zero. */
9044 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
/* Raw byte comparison against the single selector byte. */
9047 if (*(
unsigned char*)s++ == c) n++;
/* General path: the first selector initialises the table (TRUE), later ones are merged in (FALSE). */
9053 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9054 for (i=1; i<argc; i++) {
9057 enc = rb_enc_check(str, tstr);
9058 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9061 s = RSTRING_PTR(str);
9062 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9064 ascompat = rb_enc_asciicompat(enc);
9068 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9076 c = rb_enc_codepoint_len(s, send, &clen, enc);
9077 if (tr_find(c, table, del, nodel)) {
/*
 * Validates a field-separator value (excerpt).  The only visible rule is
 * that a nil `val` yields 0; the remaining checks are not shown here.
 */
9088rb_fs_check(
VALUE val)
9092 if (
NIL_P(val))
return 0;
/*
 * 256-entry lookup table indexed by byte value.  Entries are 1 for bytes
 * 9 through 13 (first row) and for byte 32 (first entry of the third row);
 * every other entry is 0.
 */
9097static const char isspacetable[256] = {
9098 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9099 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9100 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9107 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9108 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9109 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/* The cast to unsigned char keeps the index in 0..255 even where plain char is signed. */
9116#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
/*
 * Emits one split piece, with deferred handling of empty pieces (excerpt).
 *
 * result      - array to push onto; the alternative visible path yields
 * str         - source string
 * beg, len    - piece position and length
 * empty_count - running count of withheld empty pieces; a negative value
 *               disables the withholding (see the first test below)
 */
9119split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
/* An empty piece is not emitted right away: the counter is bumped and returned. */
9121 if (empty_count >= 0 &&
len == 0) {
9122 return empty_count + 1;
/* A non-empty piece arrived: first flush the withheld empties, one fresh empty string each. */
9124 if (empty_count > 0) {
9128 rb_ary_push(result, str_new_empty_String(str));
9129 }
while (--empty_count > 0);
9133 rb_yield(str_new_empty_String(str));
9134 }
while (--empty_count > 0);
9139 rb_ary_push(result, str);
9148 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
/*
 * Classifies a string split pattern (excerpt).  Visible outcomes are
 * SPLIT_TYPE_CHARS, SPLIT_TYPE_AWK for a pattern that is exactly one space
 * character, and `default_type` otherwise.
 */
9152literal_split_pattern(
VALUE spat, split_type_t default_type)
9154 rb_encoding *enc = STR_ENC_GET(spat);
9160 return SPLIT_TYPE_CHARS;
9162 else if (rb_enc_asciicompat(enc)) {
/* ASCII-compatible encoding: a single ' ' byte selects awk-style splitting. */
9163 if (
len == 1 && ptr[0] ==
' ') {
9164 return SPLIT_TYPE_AWK;
/* Otherwise the pattern must decode to ' ' and that character must span the whole pattern (len == l). */
9169 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9170 return SPLIT_TYPE_AWK;
9173 return default_type;
/*
 * Core of string splitting (excerpt: many body lines are not visible).
 * Accepts an optional pattern and an optional limit, picks one of the
 * SPLIT_TYPE_* strategies, and emits pieces through SPLIT_STR(), which
 * routes to split_string().  Returns `result` when it is truthy, otherwise
 * `str`.
 */
9186rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9191 split_type_t split_type;
9192 long beg, end, i = 0, empty_count = -1;
/* "02": zero required, up to two optional arguments; this branch runs when both were passed. */
9197 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
/* A non-positive limit is treated as no limit for the NIL_P(limit) tests below. */
9199 if (lim <= 0) limit =
Qnil;
9200 else if (lim == 1) {
9201 if (RSTRING_LEN(str) == 0)
/* No limit and lim == 0: start counting empty pieces from zero (enables deferral in split_string). */
9212 if (
NIL_P(limit) && !lim) empty_count = 0;
9214 enc = STR_ENC_GET(str);
9215 split_type = SPLIT_TYPE_REGEXP;
9217 spat = get_pat_quoted(spat, 0);
9220 split_type = SPLIT_TYPE_AWK;
9222 else if (!(spat = rb_fs_check(spat))) {
9223 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9228 if (split_type != SPLIT_TYPE_AWK) {
9233 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9234 if (split_type == SPLIT_TYPE_AWK) {
9236 split_type = SPLIT_TYPE_STRING;
9241 mustnot_broken(spat);
9242 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/* SPLIT_STR emits one piece and threads the empty-piece counter through split_string(). */
9250#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9253 char *ptr = RSTRING_PTR(str);
/* --- awk-style: split on runs of whitespace --- */
9255 if (split_type == SPLIT_TYPE_AWK) {
9260 if (result) result = rb_ary_new();
/* ASCII-only strings are scanned byte by byte using the ascii_isspace table. */
9262 if (is_ascii_string(str)) {
9263 while (ptr < eptr) {
9264 c = (
unsigned char)*ptr++;
9266 if (ascii_isspace(c)) {
9272 if (!
NIL_P(limit) && lim <= i)
break;
9275 else if (ascii_isspace(c)) {
9276 SPLIT_STR(beg, end-beg);
9279 if (!
NIL_P(limit)) ++i;
/* Other strings are scanned by decoded code point. */
9287 while (ptr < eptr) {
9290 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9299 if (!
NIL_P(limit) && lim <= i)
break;
9303 SPLIT_STR(beg, end-beg);
9306 if (!
NIL_P(limit)) ++i;
/* --- literal string separator: repeated rb_memsearch() --- */
9314 else if (split_type == SPLIT_TYPE_STRING) {
9315 char *str_start = ptr;
9316 char *substr_start = ptr;
9317 char *sptr = RSTRING_PTR(spat);
9318 long slen = RSTRING_LEN(spat);
9320 if (result) result = rb_ary_new();
9321 mustnot_broken(str);
9322 enc = rb_enc_check(str, spat);
9323 while (ptr < eptr &&
9324 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
/* `t` vs `ptr + end`: NOTE(review) presumably rejects a hit that is not on a character boundary; the adjusting code is not visible. */
9327 if (t != ptr + end) {
9331 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9334 if (!
NIL_P(limit) && lim <= ++i)
break;
9336 beg = ptr - str_start;
/* --- per-character split --- */
9338 else if (split_type == SPLIT_TYPE_CHARS) {
9339 char *str_start = ptr;
/* Capacity hint is the byte length of the string. */
9342 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9343 mustnot_broken(str);
9344 enc = rb_enc_get(str);
9345 while (ptr < eptr &&
9346 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9347 SPLIT_STR(ptr - str_start, n);
9349 if (!
NIL_P(limit) && lim <= ++i)
break;
9351 beg = ptr - str_start;
/* --- remaining case: regexp matching --- */
9354 if (result) result = rb_ary_new();
9355 long len = RSTRING_LEN(str);
9363 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
/* Zero-width match located exactly at `start`. */
9368 if (start == end && BEG(0) == END(0)) {
9373 else if (last_null == 1) {
9374 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9381 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9387 SPLIT_STR(beg, end-beg);
9388 beg = start = END(0);
/* Capture groups 1..num_regs-1 are emitted too; groups with BEG == -1 are skipped. */
9392 for (idx=1; idx < regs->num_regs; idx++) {
9393 if (BEG(idx) == -1)
continue;
9394 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9396 if (!
NIL_P(limit) && lim <= ++i)
break;
9398 if (match) rb_match_unbusy(match);
/* Trailing piece: emitted for a non-empty string when a limit is set, text remains past `beg`, or lim is negative. */
9400 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9401 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9404 return result ? result : str;
9414 return rb_str_split_m(1, &sep, str);
9417#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9423 rb_ary_push(ary, e);
9432#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
/*
 * Adjusts the end pointer `e` of the range [p, e) to drop a trailing line
 * terminator (excerpt: the assignments to `e` and the return are not
 * visible).  The visible lines step back one character from `e`, then step
 * back once more and test whether that character is '\r'.
 */
9435chomp_newline(
const char *p,
const char *e, rb_encoding *enc)
9437 const char *prev = rb_enc_prev_char(p, e, e, enc);
9440 prev = rb_enc_prev_char(p, e, e, enc);
/* `prev` is tested for NULL before it is decoded. */
9441 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9453 RSTRING_LEN(rs) != 1 ||
9454 RSTRING_PTR(rs)[0] !=
'\n')) {
9460#define rb_rs get_rs()
9467 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9468 long pos,
len, rslen;
9474 static ID keywords[1];
9479 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9483 if (!ENUM_ELEM(ary, str)) {
9491 if (!RSTRING_LEN(str))
goto end;
9493 ptr = subptr = RSTRING_PTR(str);
9495 len = RSTRING_LEN(str);
9497 rslen = RSTRING_LEN(rs);
9500 enc = rb_enc_get(str);
9502 enc = rb_enc_check(str, rs);
9507 const char *eol = NULL;
9509 while (subend < pend) {
9510 long chomp_rslen = 0;
9512 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9514 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9516 if (eol == subend)
break;
9520 chomp_rslen = -rslen;
9524 if (!subptr) subptr = subend;
9528 }
while (subend < pend);
9530 if (rslen == 0) chomp_rslen = 0;
9532 subend - subptr + (chomp ? chomp_rslen : rslen));
9533 if (ENUM_ELEM(ary, line)) {
9534 str_mod_check(str, ptr,
len);
9536 subptr = eol = NULL;
9541 rsptr = RSTRING_PTR(rs);
9542 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9551 rsptr = RSTRING_PTR(rs);
9552 rslen = RSTRING_LEN(rs);
9555 while (subptr < pend) {
9556 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9560 if (hit != adjusted) {
9564 subend = hit += rslen;
9567 subend = chomp_newline(subptr, subend, enc);
9574 if (ENUM_ELEM(ary, line)) {
9575 str_mod_check(str, ptr,
len);
9580 if (subptr != pend) {
9583 pend = chomp_newline(subptr, pend, enc);
9585 else if (pend - subptr >= rslen &&
9586 memcmp(pend - rslen, rsptr, rslen) == 0) {
9591 ENUM_ELEM(ary, line);
/* Line iteration entry point: delegates to rb_str_enumerate_lines() with 0 as the array argument. */
9612rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9615 return rb_str_enumerate_lines(argc, argv, str, 0);
/*
 * Line collection entry point: obtains `ary` from WANTARRAY (size hint 0)
 * and hands it to rb_str_enumerate_lines().
 */
9628rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9630 VALUE ary = WANTARRAY(
"lines", 0);
9631 return rb_str_enumerate_lines(argc, argv, str, ary);
9645 for (i=0; i<RSTRING_LEN(str); i++) {
9646 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
/* Byte iteration entry point: delegates to rb_str_enumerate_bytes() with 0 as the array argument. */
9664rb_str_each_byte(
VALUE str)
9667 return rb_str_enumerate_bytes(str, 0);
/*
 * Byte collection entry point: obtains `ary` from WANTARRAY, sized by the
 * string's byte length, and hands it to rb_str_enumerate_bytes().
 */
9679rb_str_bytes(
VALUE str)
9681 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9682 return rb_str_enumerate_bytes(str, ary);
9700 ptr = RSTRING_PTR(str);
9701 len = RSTRING_LEN(str);
9702 enc = rb_enc_get(str);
9705 for (i = 0; i <
len; i += n) {
9706 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9711 for (i = 0; i <
len; i += n) {
9712 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
/* Character iteration entry point: delegates to rb_str_enumerate_chars() with 0 as the array argument. */
9733rb_str_each_char(
VALUE str)
9736 return rb_str_enumerate_chars(str, 0);
/*
 * Character collection entry point: hands `ary` to rb_str_enumerate_chars().
 * The line that creates `ary` is not visible in this excerpt.
 */
9748rb_str_chars(
VALUE str)
9751 return rb_str_enumerate_chars(str, ary);
/*
 * Enumerates the code points of `str` (excerpt).  A single-byte optimizable
 * string is handed to rb_str_enumerate_bytes() instead; otherwise each code
 * point is decoded with rb_enc_codepoint_len().
 */
9755rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9760 const char *ptr, *end;
/* Shortcut: delegate to the byte enumerator for single-byte optimizable strings. */
9763 if (single_byte_optimizable(str))
9764 return rb_str_enumerate_bytes(str, ary);
9767 ptr = RSTRING_PTR(str);
9769 enc = STR_ENC_GET(str);
/* `n` receives the byte length of the decoded character. */
9772 c = rb_enc_codepoint_len(ptr, end, &n, enc);
/* Code point iteration entry point: delegates to rb_str_enumerate_codepoints() with 0 as the array argument. */
9793rb_str_each_codepoint(
VALUE str)
9796 return rb_str_enumerate_codepoints(str, 0);
/*
 * Code point collection entry point: hands `ary` to
 * rb_str_enumerate_codepoints().  The line that creates `ary` is not visible
 * in this excerpt.
 */
9808rb_str_codepoints(
VALUE str)
9811 return rb_str_enumerate_codepoints(str, ary);
/*
 * Compiles the grapheme-cluster regexp (the two characters backslash, 'X')
 * for encoding `enc` and returns the compiled regex_t (excerpt: some body
 * lines are not visible).  The pattern source defaults to the ASCII bytes;
 * for UTF-16BE/LE and UTF-32BE/LE the CASE_UTF cases swap in a source
 * spelled out in that encoding's code units.  A compile failure is reported
 * through rb_fatal().
 */
9815get_reg_grapheme_cluster(rb_encoding *enc)
9817 int encidx = rb_enc_to_index(enc);
9819 const OnigUChar source_ascii[] =
"\\X";
9820 const OnigUChar *source = source_ascii;
/* sizeof - 1 drops the terminating NUL of the literal. */
9821 size_t source_len =
sizeof(source_ascii) - 1;
/* CHARS_* expand one code unit into its bytes in the named width and byte order. */
9824#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9825#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9826#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9827#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9828#define CASE_UTF(e) \
9829 case ENCINDEX_UTF_##e: { \
9830 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9831 source = source_UTF_##e; \
9832 source_len = sizeof(source_UTF_##e); \
9835 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9843 regex_t *reg_grapheme_cluster;
/* onig_new() takes the address of the regex_t pointer it fills in. */
9845 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9846 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
/* Error path: turn the Onigmo error code into text and abort. */
9848 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9849 onig_error_code_to_str(message, r, &einfo);
9850 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9853 return reg_grapheme_cluster;
/*
 * Returns a cached grapheme-cluster regexp for `enc` (excerpt).  Only UTF-8
 * is cached in the visible code: the regexp is compiled on first use and
 * kept in a function-local static.  What is returned for other encodings is
 * not visible here.
 */
9857get_cached_reg_grapheme_cluster(rb_encoding *enc)
9859 int encidx = rb_enc_to_index(enc);
9860 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9862 if (encidx == rb_utf8_encindex()) {
/* Compile once, on the first UTF-8 request. */
9863 if (!reg_grapheme_cluster_utf8) {
9864 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9867 return reg_grapheme_cluster_utf8;
9876 size_t grapheme_cluster_count = 0;
9877 rb_encoding *enc = get_encoding(str);
9878 const char *ptr, *end;
9880 if (!rb_enc_unicode_p(enc)) {
9884 bool cached_reg_grapheme_cluster =
true;
9885 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9886 if (!reg_grapheme_cluster) {
9887 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9888 cached_reg_grapheme_cluster =
false;
9891 ptr = RSTRING_PTR(str);
9895 OnigPosition
len = onig_match(reg_grapheme_cluster,
9896 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9897 (
const OnigUChar *)ptr, NULL, 0);
9898 if (
len <= 0)
break;
9899 grapheme_cluster_count++;
9903 if (!cached_reg_grapheme_cluster) {
9904 onig_free(reg_grapheme_cluster);
9907 return SIZET2NUM(grapheme_cluster_count);
/*
 * Enumerates grapheme clusters of `str` (excerpt).  A non-Unicode encoding
 * is handed to rb_str_enumerate_chars().  Otherwise the string is matched
 * repeatedly against the grapheme-cluster regexp; a regexp that did not come
 * from the cache is freed before returning.
 */
9911rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9914 rb_encoding *enc = get_encoding(str);
9915 const char *ptr0, *ptr, *end;
9917 if (!rb_enc_unicode_p(enc)) {
9918 return rb_str_enumerate_chars(str, ary);
/* Try the cache first; on a miss compile a private regexp and remember that it must be freed. */
9923 bool cached_reg_grapheme_cluster =
true;
9924 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9925 if (!reg_grapheme_cluster) {
9926 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9927 cached_reg_grapheme_cluster =
false;
9930 ptr0 = ptr = RSTRING_PTR(str);
/* onig_match() result is used as the match length; a non-positive value ends the scan. */
9934 OnigPosition
len = onig_match(reg_grapheme_cluster,
9935 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9936 (
const OnigUChar *)ptr, NULL, 0);
9937 if (
len <= 0)
break;
9942 if (!cached_reg_grapheme_cluster) {
9943 onig_free(reg_grapheme_cluster);
/* Grapheme-cluster iteration entry point: delegates to rb_str_enumerate_grapheme_clusters() with 0 as the array argument. */
9963rb_str_each_grapheme_cluster(
VALUE str)
9966 return rb_str_enumerate_grapheme_clusters(str, 0);
/*
 * Grapheme-cluster collection entry point: hands `ary` to
 * rb_str_enumerate_grapheme_clusters().  The line that creates `ary` is not
 * visible in this excerpt.
 */
9978rb_str_grapheme_clusters(
VALUE str)
9981 return rb_str_enumerate_grapheme_clusters(str, ary);
/*
 * Computes the length `str` would have after removing its last character
 * (excerpt: the final return is not visible).  Returns 0 for an empty
 * string.  When the last character is '\n' and is preceded by '\r', the
 * cut point moves back to the '\r'.
 */
9985chopped_length(
VALUE str)
9987 rb_encoding *enc = STR_ENC_GET(str);
9988 const char *p, *p2, *beg, *end;
9990 beg = RSTRING_PTR(str);
9991 end = beg + RSTRING_LEN(str);
9992 if (beg >= end)
return 0;
/* p: start of the last character. */
9993 p = rb_enc_prev_char(beg, end, end, enc);
/* Only look for a preceding '\r' when the '\n' is not the first character. */
9995 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
9996 p2 = rb_enc_prev_char(beg, p, end, enc);
9997 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
/*
 * In-place chop (excerpt: return statements are not visible).  For a
 * non-empty string, truncates to chopped_length(str) and rewrites the
 * terminator at the new end.
 */
10013rb_str_chop_bang(
VALUE str)
10015 str_modify_keep_cr(str);
10016 if (RSTRING_LEN(str) > 0) {
10018 len = chopped_length(str);
10019 STR_SET_LEN(str,
len);
/* Terminator width comes from TERM_LEN(str). */
10020 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10039rb_str_chop(
VALUE str)
/*
 * Default chomp helper over the byte range [p, e) of `str` (excerpt: most
 * of the body, including the return, is not visible).  Two paths appear:
 * one for encodings whose minimum character length exceeds one byte, and a
 * byte-wise one.
 */
10045smart_chomp(
VALUE str,
const char *e,
const char *p)
10047 rb_encoding *enc = rb_enc_get(str);
/* Wide-unit encodings: step back by whole minimum-length units. */
10048 if (rb_enc_mbminlen(enc) > 1) {
10053 pp = e - rb_enc_mbminlen(enc);
10056 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
/* Byte-wise path: after dropping one byte, check whether the new last byte is '\r'. */
10064 if (--e > p && *(e-1) ==
'\r') {
10081 char *pp, *e, *rsptr;
10083 char *
const p = RSTRING_PTR(str);
10084 long len = RSTRING_LEN(str);
10086 if (
len == 0)
return 0;
10089 return smart_chomp(str, e, p);
10092 enc = rb_enc_get(str);
10095 if (rb_enc_mbminlen(enc) > 1) {
10100 pp -= rb_enc_mbminlen(enc);
10103 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10110 while (e > p && *(e-1) ==
'\n') {
10112 if (e > p && *(e-1) ==
'\r')
10118 if (rslen >
len)
return len;
10120 enc = rb_enc_get(rs);
10121 newline = rsptr[rslen-1];
10122 if (rslen == rb_enc_mbminlen(enc)) {
10124 if (newline ==
'\n')
10125 return smart_chomp(str, e, p);
10129 return smart_chomp(str, e, p);
10133 enc = rb_enc_check(str, rs);
10134 if (is_broken_string(rs)) {
10138 if (p[
len-1] == newline &&
10140 memcmp(rsptr, pp, rslen) == 0)) {
10141 if (at_char_boundary(p, pp, e, enc))
10142 return len - rslen;
/*
 * Picks the record separator for chomp from the argument list (excerpt).
 * The only visible line reads it from argv[0]; the argc handling and the
 * return are not shown.
 */
10154chomp_rs(
int argc,
const VALUE *argv)
10158 VALUE rs = argv[0];
10170 long olen = RSTRING_LEN(str);
10171 long len = chompped_length(str, rs);
10172 if (
len >= olen)
return Qnil;
10173 str_modify_keep_cr(str);
10174 STR_SET_LEN(str,
len);
10175 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
/*
 * In-place chomp.  Checks that `str` is modifiable, returns nil early for an
 * empty string when fewer than two arguments were passed, then resolves the
 * separator with chomp_rs() and delegates to rb_str_chomp_string().
 */
10192rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10195 str_modifiable(str);
/* With argc >= 2 the early return is skipped, so chomp_rs() still sees the arguments. */
10196 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10197 rs = chomp_rs(argc, argv);
10199 return rb_str_chomp_string(str, rs);
/*
 * Non-destructive chomp (excerpt).  The visible line resolves the separator
 * with chomp_rs(); the rest of the body is not shown.
 */
10212rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10214 VALUE rs = chomp_rs(argc, argv);
/*
 * Measures leading characters to strip from the range [s, e) of `str`
 * (excerpt: the return of the measured offset is not visible).  Returns 0
 * for a NULL or empty range.  In the single-byte path both NUL bytes and
 * ascii_isspace() bytes are skipped.
 */
10220lstrip_offset(
VALUE str,
const char *s,
const char *e, rb_encoding *enc)
/* Remember the origin of the scan. */
10222 const char *
const start = s;
10224 if (!s || s >= e)
return 0;
10227 if (single_byte_optimizable(str)) {
10228 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
/* Multibyte path: decode one code point at a time. */
10233 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
/*
 * In-place left strip (excerpt: the guarding condition and returns are not
 * visible).  Computes the leading offset with lstrip_offset(), slides the
 * remaining bytes to the front of the buffer, then sets the length and
 * terminator.
 */
10253rb_str_lstrip_bang(
VALUE str)
10257 long olen, loffset;
10259 str_modify_keep_cr(str);
10260 enc = STR_ENC_GET(str);
10262 loffset = lstrip_offset(str, start, start+olen, enc);
10264 long len = olen-loffset;
10265 s = start + loffset;
/* memmove: source and destination overlap within the same buffer. */
10266 memmove(start, s,
len);
10267 STR_SET_LEN(str,
len);
/* Terminator width is the encoding's minimum character length. */
10268 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
/*
 * Non-destructive left strip (excerpt).  When lstrip_offset() reports
 * nothing to strip, a duplicate of `str` is returned; the stripping path is
 * not visible here.
 */
10291rb_str_lstrip(
VALUE str)
10296 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10297 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
/*
 * Measures trailing characters to strip from the range [s, e) of `str`
 * (excerpt: the return of the measured offset is not visible).  Calls
 * rb_str_check_dummy_enc() on `enc` first and returns 0 for a NULL or empty
 * range.  In the single-byte path both NUL bytes and ascii_isspace() bytes
 * are skipped, scanning backwards.
 */
10302rstrip_offset(
VALUE str,
const char *s,
const char *e, rb_encoding *enc)
10306 rb_str_check_dummy_enc(enc);
10310 if (!s || s >= e)
return 0;
10314 if (single_byte_optimizable(str)) {
/* t walks backwards from the end. */
10316 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
/* Multibyte path: step back one character at a time until rb_enc_prev_char() returns NULL. */
10321 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
/*
 * In-place right strip (excerpt: the guarding condition and returns are not
 * visible).  Computes the trailing offset with rstrip_offset(), then
 * shortens the string and rewrites the terminator.
 */
10341rb_str_rstrip_bang(
VALUE str)
10345 long olen, roffset;
10347 str_modify_keep_cr(str);
10348 enc = STR_ENC_GET(str);
10350 roffset = rstrip_offset(str, start, start+olen, enc);
10352 long len = olen - roffset;
10354 STR_SET_LEN(str,
len);
/* Terminator width is the encoding's minimum character length. */
10355 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
/*
 * Non-destructive right strip (excerpt).  When rstrip_offset() reports
 * nothing to strip, a duplicate of `str` is returned; the stripping path is
 * not visible here.
 */
10378rb_str_rstrip(
VALUE str)
10382 long olen, roffset;
10384 enc = STR_ENC_GET(str);
10386 roffset = rstrip_offset(str, start, start+olen, enc);
10388 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
/*
 * In-place strip of both ends (excerpt: some lines and the returns are not
 * visible).  The trailing offset is measured on the range that remains
 * after the leading offset.  If either offset is positive the string is
 * compacted, shortened and re-terminated.
 */
10404rb_str_strip_bang(
VALUE str)
10407 long olen, loffset, roffset;
10410 str_modify_keep_cr(str);
10411 enc = STR_ENC_GET(str);
10413 loffset = lstrip_offset(str, start, start+olen, enc);
/* The right scan starts past the stripped prefix. */
10414 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10416 if (loffset > 0 || roffset > 0) {
10417 long len = olen-roffset;
/* memmove: the regions overlap within the same buffer. */
10420 memmove(start, start + loffset,
len);
10422 STR_SET_LEN(str,
len);
10423 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
/*
 * Non-destructive strip of both ends (excerpt).  When neither offset is
 * positive a duplicate of `str` is returned; the stripping path is not
 * visible here.
 */
10446rb_str_strip(
VALUE str)
10449 long olen, loffset, roffset;
10450 rb_encoding *enc = STR_ENC_GET(str);
10453 loffset = lstrip_offset(str, start, start+olen, enc);
/* The right scan starts past the stripped prefix. */
10454 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10456 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
/*
 * Performs one pattern search in `str` starting at *start and advances
 * *start for the next call (excerpt: many lines are not visible).
 *
 * str             - string being scanned
 * pat             - pattern handed to rb_pat_search()
 * start           - in/out search position
 * set_backref_str - forwarded unchanged to rb_pat_search()
 */
10461scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10464 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
/* Match end computed from the byte length of `pat`. */
10470 end = pos + RSTRING_LEN(pat);
10480 rb_encoding *enc = STR_ENC_GET(str);
/* Advance by one whole character past `end` when text remains. */
10484 if (RSTRING_LEN(str) > end)
10485 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
/* No registers, or only the whole-match register: no capture groups. */
10494 if (!regs || regs->num_regs == 1) {
/* Capture groups start at index 1. */
10500 for (
int i = 1; i < regs->num_regs; i++) {
10506 rb_ary_push(result, s);
10561 long last = -1, prev = 0;
10562 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10564 pat = get_pat_quoted(pat, 1);
10565 mustnot_broken(str);
10567 VALUE ary = rb_ary_new();
10569 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10572 rb_ary_push(ary, result);
10574 if (last >= 0) rb_pat_search(pat, str, last, 1);
10579 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10583 str_mod_check(str, p,
len);
10585 if (last >= 0) rb_pat_search(pat, str, last, 1);
/* Converts `str` to an integer via rb_str_to_inum() with base 16 and FALSE as the third argument. */
10609rb_str_hex(
VALUE str)
10611 return rb_str_to_inum(str, 16, FALSE);
/*
 * Converts `str` to an integer via rb_str_to_inum() with base argument -8
 * and FALSE as the third argument.
 * NOTE(review): the negative base presumably means "default to octal but
 * honour a radix prefix" -- confirm against rb_str_to_inum().
 */
10636rb_str_oct(
VALUE str)
10638 return rb_str_to_inum(str, -8, FALSE);
10641#ifndef HAVE_CRYPT_R
10646 rb_nativethread_lock_t lock;
10647} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10650crypt_mutex_initialize(
void)
10721# define CRYPT_END() ALLOCV_END(databuf)
10723 extern char *crypt(
const char *,
const char *);
10724# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10727 const char *s, *saltp;
10730 char salt_8bit_clean[3];
10734 mustnot_wchar(str);
10735 mustnot_wchar(salt);
10737 saltp = RSTRING_PTR(salt);
10738 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10739 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10743 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10744 salt_8bit_clean[0] = saltp[0] & 0x7f;
10745 salt_8bit_clean[1] = saltp[1] & 0x7f;
10746 salt_8bit_clean[2] =
'\0';
10747 saltp = salt_8bit_clean;
10752# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10753 data->initialized = 0;
10755 res = crypt_r(s, saltp, data);
10757 crypt_mutex_initialize();
10759 res = crypt(s, saltp);
10800 char *ptr, *p, *pend;
10803 unsigned long sum0 = 0;
10808 ptr = p = RSTRING_PTR(str);
10809 len = RSTRING_LEN(str);
10815 str_mod_check(str, ptr,
len);
10818 sum0 += (
unsigned char)*p;
10829 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
10830 sum0 &= (((
unsigned long)1)<<bits)-1;
/*
 * Shared implementation of ljust/rjust/center (excerpt: many lines are not
 * visible).
 *
 * argc, argv - width and an optional pad string
 * str        - string to justify
 * jflag      - 'l', 'r', or anything else; it selects how the total padding
 *              `n` is split: 'l' puts none on the left, 'r' puts all of it
 *              on the left, otherwise n/2 goes on the left
 *
 * Raises ArgumentError for an empty pad string and when the padded size
 * would overflow a long.
 */
10850rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
10854 long width,
len, flen = 1, fclen = 1;
/* Default pad is a single space (flen = fclen = 1). */
10857 const char *f =
" ";
10858 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10860 int singlebyte = 1, cr;
10864 enc = STR_ENC_GET(str);
10865 termlen = rb_enc_mbminlen(enc);
/* Explicit pad string: flen is its byte length, fclen its character length. */
10869 enc = rb_enc_check(str, pad);
10870 f = RSTRING_PTR(pad);
10871 flen = RSTRING_LEN(pad);
10872 fclen = str_strlen(pad, enc);
10873 singlebyte = single_byte_optimizable(pad);
10874 if (flen == 0 || fclen == 0) {
10875 rb_raise(rb_eArgError,
"zero width padding");
10878 len = str_strlen(str, enc);
/* Already wide enough, or a negative width: return a plain duplicate. */
10879 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
10881 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
/* Byte offsets of the partial pad repetition needed on each side. */
10885 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10886 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10888 size = RSTRING_LEN(str);
/* Stepwise overflow check: each stage is tested against LONG_MAX before the next is applied. */
10889 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10890 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10891 (
len += llen2 + rlen2) >= LONG_MAX - size) {
10892 rb_raise(rb_eArgError,
"argument too big");
10896 p = RSTRING_PTR(res);
/* Left padding: memset path, whole-pad copy loop, then the partial pad. */
10898 memset(p, *f, llen);
10902 while (llen >= fclen) {
10908 memcpy(p, f, llen2);
/* Original string bytes. */
10912 memcpy(p, RSTRING_PTR(str), size);
/* Right padding, mirroring the left side. */
10915 memset(p, *f, rlen);
10919 while (rlen >= fclen) {
10925 memcpy(p, f, rlen2);
10929 TERM_FILL(p, termlen);
10930 STR_SET_LEN(res, p-RSTRING_PTR(res));
/* Delegates to rb_str_justify() with jflag 'l'. */
10953rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
10955 return rb_str_justify(argc, argv, str,
'l');
/* Delegates to rb_str_justify() with jflag 'r'. */
10969rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
10971 return rb_str_justify(argc, argv, str,
'r');
/* Delegates to rb_str_justify() with jflag 'c'. */
10986rb_str_center(
int argc,
VALUE *argv,
VALUE str)
10988 return rb_str_justify(argc, argv, str,
'c');
11004 sep = get_pat_quoted(sep, 0);
11016 pos = rb_str_index(str, sep, 0);
11017 if (pos < 0)
goto failed;
11022 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11025 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11039 long pos = RSTRING_LEN(str);
11041 sep = get_pat_quoted(sep, 0);
11054 pos = rb_str_rindex(str, sep, pos);
11063 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11065 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
/*
 * Tests whether `str` starts with any of the arguments (excerpt: several
 * lines, including the final return, are not visible).  Each argument is
 * tried in order; one visible path goes through rb_reg_start_with_p(), the
 * other compares bytes.
 */
11077rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11081 for (i=0; i<argc; i++) {
11082 VALUE tmp = argv[i];
11084 if (rb_reg_start_with_p(tmp, str))
11088 const char *p, *s, *e;
11093 enc = rb_enc_check(str, tmp);
/* An empty prefix always matches. */
11094 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
/* A prefix longer than the string cannot match; try the next argument. */
11095 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11096 p = RSTRING_PTR(str);
/* The character-boundary test is evaluated before the byte comparison. */
11099 if (!at_char_right_boundary(p, s, e, enc))
11101 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
/*
 * Tests whether `str` ends with any of the arguments (excerpt: several
 * lines, including the final return, are not visible).  Each argument is
 * tried in order using a boundary check followed by a byte comparison.
 */
11117rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11121 for (i=0; i<argc; i++) {
11122 VALUE tmp = argv[i];
11123 const char *p, *s, *e;
11128 enc = rb_enc_check(str, tmp);
/* An empty suffix always matches. */
11129 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
/* A suffix longer than the string cannot match; try the next argument. */
11130 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11131 p = RSTRING_PTR(str);
/* `s` is the candidate suffix start (its assignment is not visible here). */
11134 if (!at_char_boundary(p, s, e, enc))
11136 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
/*
 * Returns how many leading bytes of `str` would be removed by deleting
 * `prefix`, or 0 when `str` does not start with it (excerpt: the final
 * return and some branch bodies are not visible).
 */
11152deleted_prefix_length(
VALUE str,
VALUE prefix)
11154 const char *strptr, *prefixptr;
11155 long olen, prefixlen;
11156 rb_encoding *enc = rb_enc_get(str);
/*
 * The encoding compatibility check is skipped only when the prefix is a
 * broken string AND both encodings are ASCII-compatible.
 */
11160 if (!is_broken_string(prefix) ||
11161 !rb_enc_asciicompat(enc) ||
11162 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11163 enc = rb_enc_check(str, prefix);
11167 prefixlen = RSTRING_LEN(prefix);
11168 if (prefixlen <= 0)
return 0;
11169 olen = RSTRING_LEN(str);
11170 if (olen < prefixlen)
return 0;
11171 strptr = RSTRING_PTR(str);
11172 prefixptr = RSTRING_PTR(prefix);
/* Cheap byte comparison first. */
11173 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11174 if (is_broken_string(prefix)) {
11175 if (!is_broken_string(str)) {
/* The byte match must also end on a character boundary of `str`. */
11179 const char *strend = strptr + olen;
11180 const char *after_prefix = strptr + prefixlen;
11181 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
/*
 * In-place prefix deletion (excerpt).  Returns nil when
 * deleted_prefix_length() reports nothing to remove; the removal itself is
 * not visible here.
 */
11201rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11204 str_modify_keep_cr(str);
11206 prefixlen = deleted_prefix_length(str, prefix);
11207 if (prefixlen <= 0)
return Qnil;
/*
 * Non-destructive prefix deletion.  Returns a duplicate of `str` when there
 * is nothing to remove, otherwise the subsequence that follows the prefix.
 */
11221rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11225 prefixlen = deleted_prefix_length(str, prefix);
11226 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11228 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
/*
 * Returns how many trailing bytes of `str` would be removed by deleting
 * `suffix`, or 0 when `str` does not end with it (excerpt: the final return
 * is not visible).  A broken `suffix` never matches.
 */
11241deleted_suffix_length(
VALUE str,
VALUE suffix)
11243 const char *strptr, *suffixptr;
11244 long olen, suffixlen;
11248 if (is_broken_string(suffix))
return 0;
11249 enc = rb_enc_check(str, suffix);
11252 suffixlen = RSTRING_LEN(suffix);
11253 if (suffixlen <= 0)
return 0;
11254 olen = RSTRING_LEN(str);
11255 if (olen < suffixlen)
return 0;
11256 strptr = RSTRING_PTR(str);
11257 suffixptr = RSTRING_PTR(suffix);
11258 const char *strend = strptr + olen;
11259 const char *before_suffix = strend - suffixlen;
/* Byte comparison first, then require the suffix to begin on a character boundary. */
11260 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11261 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
/*
 * In-place suffix deletion (excerpt: the final return is not visible).
 * Returns nil when there is nothing to remove; otherwise shortens the
 * string by the suffix length and rewrites the terminator.
 */
11276rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11278 long olen, suffixlen,
len;
/* Modifiability is checked up front; str_modify_keep_cr() is called only once a suffix match is confirmed. */
11279 str_modifiable(str);
11281 suffixlen = deleted_suffix_length(str, suffix);
11282 if (suffixlen <= 0)
return Qnil;
11284 olen = RSTRING_LEN(str);
11285 str_modify_keep_cr(str);
11286 len = olen - suffixlen;
11287 STR_SET_LEN(str,
len);
11288 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
/*
 * Non-destructive suffix deletion.  Returns a duplicate of `str` when there
 * is nothing to remove, otherwise the subsequence that precedes the suffix.
 */
11304rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11308 suffixlen = deleted_suffix_length(str, suffix);
11309 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11311 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11318 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
11326 val = rb_fs_check(val);
11329 "value of %"PRIsVALUE
" must be String or Regexp",
11333 rb_warn_deprecated(
"'$;'", NULL);
11350 str_modifiable(str);
11352 rb_encoding *encoding = rb_to_encoding(enc);
11353 int idx = rb_enc_to_index(encoding);
11360 rb_enc_associate_index(str, idx);
11384 if (STR_EMBED_P(str)) {
11385 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11390 str_replace_shared_without_enc(str2, str);
11392 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11425rb_str_valid_encoding_p(
VALUE str)
11445rb_str_is_ascii_only_p(
VALUE str)
11455 static const char ellipsis[] =
"...";
11456 const long ellipsislen =
sizeof(ellipsis) - 1;
11457 rb_encoding *
const enc = rb_enc_get(str);
11458 const long blen = RSTRING_LEN(str);
11459 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11460 VALUE estr, ret = 0;
11463 if (
len * rb_enc_mbminlen(enc) >= blen ||
11467 else if (
len <= ellipsislen ||
11469 if (rb_enc_asciicompat(enc)) {
11471 rb_enc_associate(ret, enc);
11478 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11483 rb_enc_from_encoding(enc), 0,
Qnil);
/*
 * Validates a scrub replacement string against `enc` (excerpt: the
 * conditions and the return are not visible).  The visible lines raise
 * ArgumentError for a replacement that is not a valid byte sequence, and
 * build a message naming both encodings, presumably for an encoding
 * mismatch error.
 */
11490str_compat_and_valid(
VALUE str, rb_encoding *enc)
11496 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11499 rb_encoding *e = STR_ENC_GET(str);
11502 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11508static VALUE enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl,
int cr);
11513 rb_encoding *enc = STR_ENC_GET(str);
/*
 * Public scrub entry point taking an explicit encoding (excerpt).  It
 * compares `enc` with the string's own encoding -- the branch body, which
 * presumably determines `cr`, is not visible -- and delegates to
 * enc_str_scrub().
 */
11518rb_enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl)
11521 if (enc == STR_ENC_GET(str)) {
11526 return enc_str_scrub(enc, str, repl, cr);
/*
 * Scrub implementation (excerpt: many lines are not visible).  Invalid byte
 * sequences in `str`, interpreted in `enc`, are replaced either by `repl`,
 * by the value a block yields for the invalid bytes, or by a default
 * replacement chosen per encoding.  Passing both a block and `repl` raises
 * ArgumentError.  There are two main paths: ASCII-compatible encodings and
 * the rest.
 */
11530enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl,
int cr)
11534 const char *rep, *p, *e, *p1, *sp;
11540 rb_raise(rb_eArgError,
"both of block and replacement given");
/* A supplied replacement is validated against `enc` up front. */
11547 if (!
NIL_P(repl)) {
11548 repl = str_compat_and_valid(repl, enc);
11551 if (rb_enc_dummy_p(enc)) {
11554 encidx = rb_enc_to_index(enc);
/* Points rep/replen at a static copy of the literal, without its NUL terminator. */
11556#define DEFAULT_REPLACE_CHAR(str) do { \
11557 static const char replace[sizeof(str)-1] = str; \
11558 rep = replace; replen = (int)sizeof(replace); \
11561 slen = RSTRING_LEN(str);
11562 p = RSTRING_PTR(str);
/* --- ASCII-compatible encodings --- */
11567 if (rb_enc_asciicompat(enc)) {
11573 else if (!
NIL_P(repl)) {
11574 rep = RSTRING_PTR(repl);
11575 replen = RSTRING_LEN(repl);
/* UTF-8 default: the three bytes EF BF BD (U+FFFD). */
11578 else if (encidx == rb_utf8_encindex()) {
11579 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
/* Other ASCII-compatible encodings default to "?". */
11583 DEFAULT_REPLACE_CHAR(
"?");
/* Skip ahead to the next non-ASCII byte. */
11588 p = search_nonascii(p, e);
11593 int ret = rb_enc_precise_mbclen(p, e, enc);
/* Clamp the candidate invalid length to the remaining bytes. */
11612 if (e - p < clen) clen = e - p;
/* Shrink the candidate length one byte at a time, re-testing the sequence at `q`. */
11619 for (; clen > 1; clen--) {
11620 ret = rb_enc_precise_mbclen(q, q + clen, enc);
/* Block form: yield the invalid bytes, verify the receiver was not modified, validate the result. */
11631 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11632 str_mod_check(str, sp, slen);
11633 repl = str_compat_and_valid(repl, enc);
11640 p = search_nonascii(p, e);
/* Same yield/verify/validate sequence for the bytes from p to e. */
11666 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11667 str_mod_check(str, sp, slen);
11668 repl = str_compat_and_valid(repl, enc);
/* --- non-ASCII-compatible encodings: work in units of mbminlen --- */
11677 long mbminlen = rb_enc_mbminlen(enc);
11681 else if (!
NIL_P(repl)) {
11682 rep = RSTRING_PTR(repl);
11683 replen = RSTRING_LEN(repl);
/* UTF-16/32 defaults: U+FFFD spelled in each byte order. */
11685 else if (encidx == ENCINDEX_UTF_16BE) {
11686 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11688 else if (encidx == ENCINDEX_UTF_16LE) {
11689 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11691 else if (encidx == ENCINDEX_UTF_32BE) {
11692 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11694 else if (encidx == ENCINDEX_UTF_32LE) {
11695 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11698 DEFAULT_REPLACE_CHAR(
"?");
11702 int ret = rb_enc_precise_mbclen(p, e, enc);
11715 if (e - p < clen) clen = e - p;
11716 if (clen <= mbminlen * 2) {
/* Shrink by whole minimum-length units rather than single bytes. */
11721 for (; clen > mbminlen; clen-=mbminlen) {
11722 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11732 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11733 str_mod_check(str, sp, slen);
11734 repl = str_compat_and_valid(repl, enc);
11759 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11760 str_mod_check(str, sp, slen);
11761 repl = str_compat_and_valid(repl, enc);
11797str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
11805static ID id_normalize;
11806static ID id_normalized_p;
11807static VALUE mUnicodeNormalize;
/*
 * Shared dispatcher for the Unicode normalization methods (excerpt: the
 * construction of argv2 is not visible).  On first use it requires
 * "unicode_normalize/normalize.rb", guarded by a function-local static flag,
 * then calls method `id` on mUnicodeNormalize with argc+1 arguments.
 */
11810unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
11812 static int UnicodeNormalizeRequired = 0;
/* Load the Ruby-level implementation only once. */
11815 if (!UnicodeNormalizeRequired) {
11816 rb_require(
"unicode_normalize/normalize.rb");
11817 UnicodeNormalizeRequired = 1;
/* argc+1: NOTE(review) presumably argv2 prepends `str` to the caller's arguments -- confirm. */
11821 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
/* Delegates to unicode_normalize_common() with id_normalize and returns its result. */
11858rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
11860 return unicode_normalize_common(argc, argv, str, id_normalize);
/* In-place variant: replaces the contents of `str` with the result of unicode_normalize_common() for id_normalize. */
11874rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
11876 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
/* Delegates to unicode_normalize_common() with id_normalized_p and returns its result. */
11903rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
11905 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12037#define sym_equal rb_obj_equal
/*
 * Checks the characters in the range [s, send) under `enc` (excerpt: the
 * loop, the tests and the return are not visible).  Each character's length
 * is obtained with rb_enc_precise_mbclen() before it is decoded to a code
 * point.
 */
12040sym_printable(
const char *s,
const char *send, rb_encoding *enc)
12044 int c = rb_enc_precise_mbclen(s, send, enc);
/* `c` is reused: first the character length result, then the code point. */
12048 c = rb_enc_mbc_to_codepoint(s, send, enc);
12056rb_str_symname_p(
VALUE sym)
12061 rb_encoding *resenc = rb_default_internal_encoding();
12063 if (resenc == NULL) resenc = rb_default_external_encoding();
12064 enc = STR_ENC_GET(sym);
12065 ptr = RSTRING_PTR(sym);
12066 len = RSTRING_LEN(sym);
12067 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12075rb_str_quote_unprintable(
VALUE str)
12080 rb_encoding *resenc;
12083 resenc = rb_default_internal_encoding();
12084 if (resenc == NULL) resenc = rb_default_external_encoding();
12085 enc = STR_ENC_GET(str);
12086 ptr = RSTRING_PTR(str);
12087 len = RSTRING_LEN(str);
12088 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12089 !sym_printable(ptr, ptr +
len, enc)) {
12090 return rb_str_escape(str);
12096rb_id_quote_unprintable(
ID id)
12098 VALUE str = rb_id2str(
id);
12099 if (!rb_str_symname_p(str)) {
12100 return rb_str_escape(str);
12118sym_inspect(
VALUE sym)
12125 if (!rb_str_symname_p(str)) {
12127 len = RSTRING_LEN(str);
12128 rb_str_resize(str,
len + 1);
12129 dest = RSTRING_PTR(str);
12130 memmove(dest + 1, dest,
len);
12133 rb_encoding *enc = STR_ENC_GET(str);
12134 VALUE orig_str = str;
12136 len = RSTRING_LEN(orig_str);
12137 str = rb_enc_str_new(0,
len + 1, enc);
12140 ptr = RSTRING_PTR(orig_str);
12141 dest = RSTRING_PTR(str);
12142 memcpy(dest + 1, ptr,
len);
12162rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12167 rb_raise(rb_eArgError,
"no receiver given");
12264 return rb_str_match(
rb_sym2str(sym), other);
12279sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12281 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12294sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12296 return rb_str_match_m_p(argc, argv, sym);
12314 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12325sym_length(
VALUE sym)
12339sym_empty(
VALUE sym)
12373sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12389sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12405sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12419sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12421 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12434sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12436 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12448sym_encoding(
VALUE sym)
12454string_for_symbol(
VALUE name)
12459 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12473 name = string_for_symbol(name);
12474 return rb_intern_str(name);
12483 name = string_for_symbol(name);
12507 return rb_fstring(str);
12514 return register_fstring(setup_fake_str(&fake_str, ptr,
len, ENCINDEX_US_ASCII),
true,
false);
12526 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12527 rb_enc_autoload(enc);
12531 return register_fstring(rb_setup_fake_str(&fake_str, ptr,
len, enc),
true,
false);
12535rb_enc_literal_str(
const char *ptr,
long len, rb_encoding *enc)
12537 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12538 rb_enc_autoload(enc);
12542 return register_fstring(rb_setup_fake_str(&fake_str, ptr,
len, enc),
true,
true);
12553rb_yjit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12558 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12559 rb_str_buf_cat_byte(str, (
char) code);
12573 st_foreach(rb_vm_fstring_table(), fstring_set_class_i,
rb_cString);
12739 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single bit of the symbols that have ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "defaultexternal" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "defaultexternal" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
struct rb_data_type_struct rb_data_type_t
This is the struct that holds necessary info for a struct.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
long len
Length of the string, not including terminating NUL character.
union RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024::@365170260060164113275356137374160141226332013204 aux
Auxiliary info.
struct RString::@157025041137035241047331270155043025061071337053::@153056146250355212360325351117351053336274231135 embed
Embedded contents.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
union RString::@157025041137035241047331270155043025061071337053 as
String's specific fields.
struct RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024 heap
Strings that use separated memory region for contents use this pattern.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.