14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
49#include "ruby_assert.h"
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
67#undef rb_usascii_str_new
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
/*
 * String-private constant and object flag bits.
 * Each STR_* name below is an alias for one of the generic FL_USERn bits.
 */
/* NOTE(review): presumably the maximum byte length of a single character;
 * it is not referenced in the visible part of this file -- confirm. */
131#define RUBY_MAX_CHAR_LEN 16
/* Set by str_store_precomputed_hash() after it copies the hash value into
 * the bytes that follow the string's terminator. */
132#define STR_PRECOMPUTED_HASH FL_USER4
/* Set by STR_SET_SHARED() on the string that other strings share from. */
133#define STR_SHARED_ROOT FL_USER5
/* Set by STR_SET_SHARED() on a shared root whose RBASIC_CLASS() is 0. */
134#define STR_BORROWED FL_USER6
/* Tested by rb_check_lockedtmp(); also part of STR_UNMODIFIABLE_MASK. */
135#define STR_TMPLOCK FL_USER7
/* Set by str_new_static(), which stores the caller's pointer directly as
 * the heap pointer; the visible free paths do not pass the buffer of a
 * string carrying this flag to ruby_sized_xfree(). */
136#define STR_NOFREE FL_USER18
/* STR_SET_SHARED() guards its work with a test that this flag is clear. */
137#define STR_FAKESTR FL_USER19
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/* Clears the STR_NOEMBED, STR_SHARED and STR_NOFREE flags on str. */
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
150str_encindex_fastpath(
int encindex)
154 case ENCINDEX_ASCII_8BIT:
156 case ENCINDEX_US_ASCII:
164str_enc_fastpath(
VALUE str)
/*
 * Terminator length for str: 1 when str_enc_fastpath(str) is true,
 * otherwise rb_enc_mbminlen() of the encoding looked up from
 * ENCODING_GET(str).
 */
169#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
170#define TERM_FILL(ptr, termlen) do {\
171 char *const term_fill_ptr = (ptr);\
172 const int term_fill_len = (termlen);\
173 *term_fill_ptr = '\0';\
174 if (UNLIKELY(term_fill_len > 1))\
175 memset(term_fill_ptr, 0, term_fill_len);\
178#define RESIZE_CAPA(str,capacity) do {\
179 const int termlen = TERM_LEN(str);\
180 RESIZE_CAPA_TERM(str,capacity,termlen);\
182#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
183 if (STR_EMBED_P(str)) {\
184 if (str_embed_capa(str) < capacity + termlen) {\
185 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
186 const long tlen = RSTRING_LEN(str);\
187 memcpy(tmp, RSTRING_PTR(str), tlen);\
188 RSTRING(str)->as.heap.ptr = tmp;\
189 RSTRING(str)->len = tlen;\
190 STR_SET_NOEMBED(str);\
191 RSTRING(str)->as.heap.aux.capa = (capacity);\
195 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
196 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
197 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
198 RSTRING(str)->as.heap.aux.capa = (capacity);\
202#define STR_SET_SHARED(str, shared_str) do { \
203 if (!FL_TEST(str, STR_FAKESTR)) { \
204 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
205 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
206 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
207 FL_SET((str), STR_SHARED); \
208 FL_SET((shared_str), STR_SHARED_ROOT); \
209 if (RBASIC_CLASS((shared_str)) == 0) \
210 FL_SET_RAW((shared_str), STR_BORROWED); \
/* Heap buffer pointer field of a string (as.heap.ptr). */
214#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap capacity field plus the terminator length, as a size_t; this is the
 * size handed to ruby_sized_xfree() and SIZED_REALLOC_N() in this file. */
215#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* Shorthand for get_encoding(str). */
218#define STR_ENC_GET(str) get_encoding(str)
220#if !defined SHARABLE_MIDDLE_SUBSTRING
221# define SHARABLE_MIDDLE_SUBSTRING 0
223#if !SHARABLE_MIDDLE_SUBSTRING
224#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
226#define SHARABLE_SUBSTRING_P(beg, len, end) 1
231str_embed_capa(
VALUE str)
233 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
237rb_str_reembeddable_p(
VALUE str)
239 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
243rb_str_embed_size(
long capa,
long termlen)
251rb_str_size_as_embedded(
VALUE str)
254 if (STR_EMBED_P(str)) {
256 if (
FL_TEST_RAW(str, STR_PRECOMPUTED_HASH))
capa +=
sizeof(st_index_t);
258 real_size = rb_str_embed_size(
capa, TERM_LEN(str));
262 else if (rb_str_reembeddable_p(str)) {
264 if (
FL_TEST_RAW(str, STR_PRECOMPUTED_HASH))
capa +=
sizeof(st_index_t);
266 real_size = rb_str_embed_size(
capa, TERM_LEN(str));
269 real_size =
sizeof(
struct RString);
276STR_EMBEDDABLE_P(
long len,
long termlen)
278 return rb_gc_size_allocatable_p(rb_str_embed_size(
len, termlen));
/*
 * Forward declarations of static helpers whose definitions appear further
 * down in this file (str_new_frozen_buffer, str_new_static,
 * str_make_independent_expand and str_modifiable).
 */
283static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
284static VALUE str_new_static(
VALUE klass,
const char *ptr,
long len,
int encindex);
286static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
287static inline void str_modifiable(
VALUE str);
292str_make_independent(
VALUE str)
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str),
len, 0L, termlen);
/* Forward declaration: rb_str_make_independent() below calls this before
 * its definition, which appears later in the file. */
299static inline int str_dependent_p(
VALUE str);
302rb_str_make_independent(
VALUE str)
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
310rb_str_make_embedded(
VALUE str)
315 char *buf =
RSTRING(str)->as.heap.ptr;
319 STR_SET_LEN(str,
len);
326 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
330rb_debug_rstring_null_ptr(
const char *func)
332 fprintf(stderr,
"%s is returning NULL!! "
333 "SIGSEGV is highly expected to follow immediately.\n"
334 "If you could reproduce, attach your debugger here, "
335 "and look at the passed string.\n",
340static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
343get_encoding(
VALUE str)
349mustnot_broken(
VALUE str)
351 if (is_broken_string(str)) {
352 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
357mustnot_wchar(
VALUE str)
359 rb_encoding *enc = STR_ENC_GET(str);
360 if (rb_enc_mbminlen(enc) > 1) {
361 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
/* Forward declaration; register_fstring() is defined further down, after
 * the fstring table callbacks, and is called before that point. */
365static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
367#if SIZEOF_LONG == SIZEOF_VOIDP
368#define PRECOMPUTED_FAKESTR_HASH 1
373BARE_STRING_P(
VALUE str)
378static inline st_index_t
379str_do_hash(
VALUE str)
383 if (e && !is_ascii_string(str)) {
390str_store_precomputed_hash(
VALUE str, st_index_t hash)
396 size_t used_bytes = (
RSTRING_LEN(str) + TERM_LEN(str));
397 size_t free_bytes = str_embed_capa(str) - used_bytes;
401 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
403 FL_SET(str, STR_PRECOMPUTED_HASH);
416 if (
FL_TEST(str, RSTRING_FSTR))
419 bare = BARE_STRING_P(str);
421 if (STR_EMBED_P(str)) {
426 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
435 fstr = register_fstring(str,
false,
false);
438 str_replace_shared_without_enc(str, fstr);
/* The fstring table: assigned from rb_concurrent_set_new() in
 * Init_fstring_table(), which also registers its address with the GC via
 * rb_gc_register_address(). */
445static VALUE fstring_table_obj;
448fstring_concurrent_set_hash(
VALUE str)
450#ifdef PRECOMPUTED_FAKESTR_HASH
454 h = (st_index_t)
RSTRING(str)->as.heap.aux.capa;
471 const char *aptr, *bptr;
478 return (alen == blen &&
480 memcmp(aptr, bptr, alen) == 0);
485 bool force_precompute_hash;
489fstring_concurrent_set_create(
VALUE str,
void *data)
499 long len = RSTRING_LEN(str);
500 long capa =
len +
sizeof(st_index_t);
501 int term_len = TERM_LEN(str);
503 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
505 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
506 STR_SET_LEN(new_str, RSTRING_LEN(str));
508 rb_enc_copy(new_str, str);
509 str_store_precomputed_hash(new_str, str_do_hash(str));
513 rb_enc_copy(new_str, str);
514#ifdef PRECOMPUTED_FAKESTR_HASH
515 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
516 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
530 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
533 if (STR_SHARED_P(str)) {
535 str_make_independent(str);
538 if (!BARE_STRING_P(str)) {
544 RBASIC(str)->flags |= RSTRING_FSTR;
546 RB_OBJ_SET_SHAREABLE(str);
560 .hash = fstring_concurrent_set_hash,
561 .cmp = fstring_concurrent_set_cmp,
562 .create = fstring_concurrent_set_create,
567Init_fstring_table(
void)
569 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
570 rb_gc_register_address(&fstring_table_obj);
574register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
578 .force_precompute_hash = force_precompute_hash
581#if SIZEOF_VOIDP == SIZEOF_LONG
585 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
589 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
591 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
603rb_obj_is_fstring_table(
VALUE obj)
607 return obj == fstring_table_obj;
611rb_gc_free_fstring(
VALUE obj)
613 ASSERT_vm_locking_with_barrier();
619 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
621 RB_DEBUG_COUNTER_INC(obj_str_fstr);
627rb_fstring_foreach_with_replace(
int (*callback)(
VALUE *str,
void *data),
void *data)
629 if (fstring_table_obj) {
630 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
635setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
638 RBASIC_SET_SHAPE_ID((
VALUE)fake_str, ROOT_SHAPE_ID);
651 return (
VALUE)fake_str;
658rb_setup_fake_str(
struct RString *fake_str,
const char *name,
long len, rb_encoding *enc)
660 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
669rb_fstring_new(
const char *ptr,
long len)
671 struct RString fake_str = {RBASIC_INIT};
672 return register_fstring(setup_fake_str(&fake_str, ptr,
len, ENCINDEX_US_ASCII),
false,
false);
676rb_fstring_enc_new(
const char *ptr,
long len, rb_encoding *enc)
678 struct RString fake_str = {RBASIC_INIT};
679 return register_fstring(rb_setup_fake_str(&fake_str, ptr,
len, enc),
false,
false);
683rb_fstring_cstr(
const char *ptr)
685 return rb_fstring_new(ptr, strlen(ptr));
689single_byte_optimizable(
VALUE str)
693 case ENCINDEX_ASCII_8BIT:
694 case ENCINDEX_US_ASCII:
716static inline const char *
717search_nonascii(
const char *p,
const char *e)
721#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
722# if SIZEOF_UINTPTR_T == 8
723# define NONASCII_MASK UINT64_C(0x8080808080808080)
724# elif SIZEOF_UINTPTR_T == 4
725# define NONASCII_MASK UINT32_C(0x80808080)
727# error "don't know what to do."
730# if SIZEOF_UINTPTR_T == 8
731# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
732# elif SIZEOF_UINTPTR_T == 4
733# define NONASCII_MASK 0x80808080UL
735# error "don't know what to do."
739 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
740#if !UNALIGNED_WORD_ACCESS
741 if ((uintptr_t)p % SIZEOF_VOIDP) {
742 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
747 case 7:
if (p[-7]&0x80)
return p-7;
748 case 6:
if (p[-6]&0x80)
return p-6;
749 case 5:
if (p[-5]&0x80)
return p-5;
750 case 4:
if (p[-4]&0x80)
return p-4;
752 case 3:
if (p[-3]&0x80)
return p-3;
753 case 2:
if (p[-2]&0x80)
return p-2;
754 case 1:
if (p[-1]&0x80)
return p-1;
759#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
760#define aligned_ptr(value) \
761 __builtin_assume_aligned((value), sizeof(uintptr_t))
763#define aligned_ptr(value) (value)
766 t = (e - (SIZEOF_VOIDP-1));
768 for (;s < t; s +=
sizeof(uintptr_t)) {
770 memcpy(&word, s,
sizeof(word));
771 if (word & NONASCII_MASK) {
772#ifdef WORDS_BIGENDIAN
773 return (
const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
775 return (
const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
785 case 7:
if (e[-7]&0x80)
return e-7;
786 case 6:
if (e[-6]&0x80)
return e-6;
787 case 5:
if (e[-5]&0x80)
return e-5;
788 case 4:
if (e[-4]&0x80)
return e-4;
790 case 3:
if (e[-3]&0x80)
return e-3;
791 case 2:
if (e[-2]&0x80)
return e-2;
792 case 1:
if (e[-1]&0x80)
return e-1;
798coderange_scan(
const char *p,
long len, rb_encoding *enc)
800 const char *e = p +
len;
804 p = search_nonascii(p, e);
808 if (rb_enc_asciicompat(enc)) {
809 p = search_nonascii(p, e);
812 int ret = rb_enc_precise_mbclen(p, e, enc);
816 p = search_nonascii(p, e);
822 int ret = rb_enc_precise_mbclen(p, e, enc);
841 p = search_nonascii(p, e);
845 else if (rb_enc_asciicompat(enc)) {
846 p = search_nonascii(p, e);
852 int ret = rb_enc_precise_mbclen(p, e, enc);
859 p = search_nonascii(p, e);
865 int ret = rb_enc_precise_mbclen(p, e, enc);
890 rb_enc_set_index(str1, rb_enc_get_index(str2));
898rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
903 str_enc_copy(dest, src);
905 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
916 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
928rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
930 str_enc_copy(dest, src);
935enc_coderange_scan(
VALUE str, rb_encoding *enc)
941rb_enc_str_coderange_scan(
VALUE str, rb_encoding *enc)
943 return enc_coderange_scan(str, enc);
952 cr = enc_coderange_scan(str, get_encoding(str));
959rb_enc_str_asciicompat(
VALUE str)
962 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
970 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
979str_mod_check(
VALUE s,
const char *p,
long len)
987str_capacity(
VALUE str,
const int termlen)
989 if (STR_EMBED_P(str)) {
990 return str_embed_capa(str) - termlen;
992 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
996 return RSTRING(str)->as.heap.aux.capa;
1003 return str_capacity(str, TERM_LEN(str));
1007must_not_null(
const char *ptr)
1010 rb_raise(rb_eArgError,
"NULL pointer given");
1015str_alloc_embed(
VALUE klass,
size_t capa)
1017 size_t size = rb_str_embed_size(
capa, 0);
1021 NEWOBJ_OF(str,
struct RString, klass,
1025 str->as.embed.ary[0] = 0;
1031str_alloc_heap(
VALUE klass)
1033 NEWOBJ_OF(str,
struct RString, klass,
1037 str->as.heap.aux.capa = 0;
1038 str->as.heap.ptr = NULL;
1044empty_str_alloc(
VALUE klass)
1046 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1047 VALUE str = str_alloc_embed(klass, 0);
1048 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1054str_enc_new(
VALUE klass,
const char *ptr,
long len, rb_encoding *enc)
1059 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1066 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1068 int termlen = rb_enc_mbminlen(enc);
1070 if (STR_EMBEDDABLE_P(
len, termlen)) {
1071 str = str_alloc_embed(klass,
len + termlen);
1077 str = str_alloc_heap(klass);
1083 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1086 rb_enc_raw_set(str, enc);
1095 STR_SET_LEN(str,
len);
1101str_new(
VALUE klass,
const char *ptr,
long len)
1125rb_enc_str_new(
const char *ptr,
long len, rb_encoding *enc)
1138 __msan_unpoison_string(ptr);
1158 if (rb_enc_mbminlen(enc) != 1) {
1159 rb_raise(rb_eArgError,
"wchar encoding given");
1161 return rb_enc_str_new(ptr, strlen(ptr), enc);
1165str_new_static(
VALUE klass,
const char *ptr,
long len,
int encindex)
1170 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1174 str = str_enc_new(klass, ptr,
len, rb_enc_from_index(encindex));
1177 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1178 str = str_alloc_heap(klass);
1180 RSTRING(str)->as.heap.ptr = (
char *)ptr;
1182 RBASIC(str)->flags |= STR_NOFREE;
1183 rb_enc_associate_index(str, encindex);
1197 return str_new_static(
rb_cString, ptr,
len, ENCINDEX_US_ASCII);
1203 return str_new_static(
rb_cString, ptr,
len, ENCINDEX_UTF_8);
1209 return str_new_static(
rb_cString, ptr,
len, rb_enc_to_index(enc));
/* Forward declaration; the definition appears further down and may return
 * Qnil (callers in this file test the result with NIL_P()). */
1212static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1213 rb_encoding *from, rb_encoding *to,
1214 int ecflags,
VALUE ecopts);
1217is_enc_ascii_string(
VALUE str, rb_encoding *enc)
1219 int encidx = rb_enc_to_index(enc);
1220 if (rb_enc_get_index(str) == encidx)
1221 return is_ascii_string(str);
1232 if (!to)
return str;
1233 if (!from) from = rb_enc_get(str);
1234 if (from == to)
return str;
1235 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1236 rb_is_ascii8bit_enc(to)) {
1237 if (STR_ENC_GET(str) != to) {
1239 rb_enc_associate(str, to);
1246 from, to, ecflags, ecopts);
1247 if (
NIL_P(newstr)) {
1255rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1256 rb_encoding *from,
int ecflags,
VALUE ecopts)
1261 if (ofs < -olen || olen < ofs)
1263 if (ofs < 0) ofs += olen;
1265 STR_SET_LEN(newstr, ofs);
1269 rb_str_modify(newstr);
1270 return str_cat_conv_enc_opts(newstr, ofs, ptr,
len, from,
1276rb_str_initialize(
VALUE str,
const char *ptr,
long len, rb_encoding *enc)
1278 STR_SET_LEN(str, 0);
1279 rb_enc_associate(str, enc);
1285str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1286 rb_encoding *from, rb_encoding *to,
1287 int ecflags,
VALUE ecopts)
1292 VALUE econv_wrapper;
1293 const unsigned char *start, *sp;
1294 unsigned char *dest, *dp;
1295 size_t converted_output = (size_t)ofs;
1300 RBASIC_CLEAR_CLASS(econv_wrapper);
1302 if (!ec)
return Qnil;
1305 sp = (
unsigned char*)ptr;
1307 while ((dest = (
unsigned char*)
RSTRING_PTR(newstr)),
1308 (dp = dest + converted_output),
1312 size_t converted_input = sp - start;
1313 size_t rest =
len - converted_input;
1314 converted_output = dp - dest;
1316 if (converted_input && converted_output &&
1317 rest < (LONG_MAX / converted_output)) {
1318 rest = (rest * converted_output) / converted_input;
1323 olen += rest < 2 ? 2 : rest;
1324 rb_str_resize(newstr, olen);
1333 rb_enc_associate(newstr, to);
1352 const int eidx = rb_enc_to_index(eenc);
1355 return rb_enc_str_new(ptr,
len, eenc);
1365 if (!ienc || eenc == ienc) {
1366 return rb_enc_str_new(ptr,
len, eenc);
1372 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr +
len))) {
1373 return rb_enc_str_new(ptr,
len, ienc);
1376 str = rb_enc_str_new(NULL, 0, ienc);
1379 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr,
len, eenc, 0,
Qnil))) {
1380 rb_str_initialize(str, ptr,
len, eenc);
1386rb_external_str_with_enc(
VALUE str, rb_encoding *eenc)
1388 int eidx = rb_enc_to_index(eenc);
1390 !is_ascii_string(str)) {
1394 rb_enc_associate_index(str, eidx);
1453str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1455 const int termlen = TERM_LEN(str);
1460 if (str_embed_capa(str2) >=
len + termlen) {
1461 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1462 STR_SET_EMBED(str2);
1464 TERM_FILL(ptr2+
len, termlen);
1468 if (STR_SHARED_P(str)) {
1469 root =
RSTRING(str)->as.heap.aux.shared;
1478 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1480 rb_fatal(
"about to free a possible shared root");
1482 char *ptr2 = STR_HEAP_PTR(str2);
1484 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1487 FL_SET(str2, STR_NOEMBED);
1488 RSTRING(str2)->as.heap.ptr = ptr;
1489 STR_SET_SHARED(str2, root);
1492 STR_SET_LEN(str2,
len);
1500 str_replace_shared_without_enc(str2, str);
1501 rb_enc_cr_str_exact_copy(str2, str);
1508 return str_replace_shared(str_alloc_heap(klass), str);
1525rb_str_new_frozen_String(
VALUE orig)
1533rb_str_frozen_bare_string(
VALUE orig)
1535 if (RB_LIKELY(BARE_STRING_P(orig) &&
OBJ_FROZEN_RAW(orig)))
return orig;
1540rb_str_tmp_frozen_acquire(
VALUE orig)
1543 return str_new_frozen_buffer(0, orig, FALSE);
1547rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1549 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1550 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1552 VALUE str = str_alloc_heap(0);
1555 FL_SET(str, STR_SHARED_ROOT);
1557 size_t capa = str_capacity(orig, TERM_LEN(orig));
1563 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1564 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1571 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1572 RBASIC(orig)->flags &= ~STR_NOFREE;
1573 STR_SET_SHARED(orig, str);
1575 RB_OBJ_SET_SHAREABLE(str);
1587rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1592 if (STR_EMBED_P(tmp)) {
1595 else if (
FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1605 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1606 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1611 STR_SET_LEN(tmp, 0);
1619 return str_new_frozen_buffer(klass, orig, TRUE);
1629 VALUE str = str_alloc_heap(klass);
1632 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1633 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1634 RBASIC(orig)->flags &= ~STR_NOFREE;
1635 STR_SET_SHARED(orig, str);
1642str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1647 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1648 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1650 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1664 if ((ofs > 0) || (rest > 0) ||
1667 str = str_new_shared(klass,
shared);
1669 RSTRING(str)->as.heap.ptr += ofs;
1670 STR_SET_LEN(str,
RSTRING_LEN(str) - (ofs + rest));
1678 else if (STR_EMBEDDABLE_P(
RSTRING_LEN(orig), TERM_LEN(orig))) {
1679 str = str_alloc_embed(klass,
RSTRING_LEN(orig) + TERM_LEN(orig));
1691 str = heap_str_make_shared(klass, orig);
1696 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1708str_new_empty_String(
VALUE str)
1711 rb_enc_copy(v, str);
/* Lower bound for a requested buffer capacity: a later comparison in this
 * file raises capa to this value when capa is smaller. */
1715#define STR_BUF_MIN_SIZE 63
1720 if (STR_EMBEDDABLE_P(
capa, 1)) {
1728 RSTRING(str)->as.heap.ptr[0] =
'\0';
1737 long len = strlen(ptr);
1748 return str_new(0, 0,
len);
1754 if (STR_EMBED_P(str)) {
1755 RB_DEBUG_COUNTER_INC(obj_str_embed);
1757 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1758 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1759 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1762 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1763 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1768rb_str_memsize(
VALUE str)
1770 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1771 return STR_HEAP_SIZE(str);
1781 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
/* Forward declarations. str_discard() is defined further down in this
 * file; str_shared_replace() is called shortly after this point. */
1784static inline void str_discard(
VALUE str);
1785static void str_shared_replace(
VALUE str,
VALUE str2);
1790 if (str != str2) str_shared_replace(str, str2);
1801 enc = STR_ENC_GET(str2);
1804 termlen = rb_enc_mbminlen(enc);
1808 if (str_embed_capa(str) >=
RSTRING_LEN(str2) + termlen) {
1811 rb_enc_associate(str, enc);
1815 if (STR_EMBED_P(str2)) {
1820 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1821 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1822 RSTRING(str2)->as.heap.ptr = new_ptr;
1823 STR_SET_LEN(str2,
len);
1825 STR_SET_NOEMBED(str2);
1828 STR_SET_NOEMBED(str);
1832 if (
FL_TEST(str2, STR_SHARED)) {
1834 STR_SET_SHARED(str,
shared);
1837 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1841 STR_SET_EMBED(str2);
1843 STR_SET_LEN(str2, 0);
1844 rb_enc_associate(str, enc);
1858 return rb_obj_as_string_result(str, obj);
1875 if (STR_SHARED_P(str2)) {
1878 STR_SET_NOEMBED(str);
1879 STR_SET_LEN(str,
len);
1881 STR_SET_SHARED(str,
shared);
1882 rb_enc_cr_str_exact_copy(str, str2);
1885 str_replace_shared(str, str2);
1894 size_t size = rb_str_embed_size(
capa, 0);
1898 NEWOBJ_OF(str,
struct RString, klass,
1909 NEWOBJ_OF(str,
struct RString, klass,
1912 str->as.heap.aux.capa = 0;
1913 str->as.heap.ptr = NULL;
1923 encidx = rb_enc_get_index(str);
1927 if (encidx) rb_enc_associate_index(dup, encidx);
1943 return str_duplicate_setup_encoding(str, dup, flags);
1952 root =
RSTRING(str)->as.heap.aux.shared;
1955 root = str = str_new_frozen(klass, str);
1962 FL_SET(root, STR_SHARED_ROOT);
1964 flags |= RSTRING_NOEMBED | STR_SHARED;
1967 return str_duplicate_setup_encoding(str, dup, flags);
1973 if (STR_EMBED_P(str)) {
1974 return str_duplicate_setup_embed(klass, str, dup);
1977 return str_duplicate_setup_heap(klass, str, dup);
1985 if (STR_EMBED_P(str)) {
1986 dup = str_alloc_embed(klass,
RSTRING_LEN(str) + TERM_LEN(str));
1989 dup = str_alloc_heap(klass);
1992 return str_duplicate_setup(klass, str, dup);
2003rb_str_dup_m(
VALUE str)
2005 if (LIKELY(BARE_STRING_P(str))) {
2016 RUBY_DTRACE_CREATE_HOOK(STRING,
RSTRING_LEN(str));
2023 RUBY_DTRACE_CREATE_HOOK(STRING,
RSTRING_LEN(str));
2027 new_str = ec_str_alloc_embed(ec, klass,
RSTRING_LEN(str) + TERM_LEN(str));
2028 str_duplicate_setup_embed(klass, str, new_str);
2031 new_str = ec_str_alloc_heap(ec, klass);
2032 str_duplicate_setup_heap(klass, str, new_str);
2041rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
2043 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
2067 static ID keyword_ids[2];
2068 VALUE orig, opt, venc, vcapa;
2070 rb_encoding *enc = 0;
2073 if (!keyword_ids[0]) {
2074 keyword_ids[0] = rb_id_encoding();
2075 CONST_ID(keyword_ids[1],
"capacity");
2083 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
2084 enc = rb_to_encoding(venc);
2086 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2089 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2091 if (
capa < STR_BUF_MIN_SIZE) {
2092 capa = STR_BUF_MIN_SIZE;
2100 if (orig == str) n = 0;
2102 str_modifiable(str);
2103 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2105 const size_t size = (size_t)
capa + termlen;
2107 const size_t osize =
RSTRING_LEN(str) + TERM_LEN(str);
2108 char *new_ptr =
ALLOC_N(
char, size);
2109 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2110 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2112 RSTRING(str)->as.heap.ptr = new_ptr;
2114 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2115 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2116 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2118 STR_SET_LEN(str,
len);
2121 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2122 rb_enc_cr_str_exact_copy(str, orig);
2124 FL_SET(str, STR_NOEMBED);
2131 rb_enc_associate(str, enc);
2143rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2149 static ID keyword_ids[2];
2152 rb_encoding *enc = NULL;
2159 keyword_ids[0] = rb_id_encoding();
2160 CONST_ID(keyword_ids[1],
"capacity");
2162 encoding = kwargs[0];
2163 capacity = kwargs[1];
2172 if (UNDEF_P(encoding)) {
2174 encoding = rb_obj_encoding(orig);
2178 if (!UNDEF_P(encoding)) {
2179 enc = rb_to_encoding(encoding);
2183 if (UNDEF_P(capacity)) {
2185 VALUE empty_str = str_new(klass,
"", 0);
2187 rb_enc_associate(empty_str, enc);
2191 VALUE copy = str_duplicate(klass, orig);
2192 rb_enc_associate(copy, enc);
2205 if (orig_capa >
capa) {
2210 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2211 STR_SET_LEN(str, 0);
/* True unless the top two bits of c are 10, i.e. unless c has the form of
 * a UTF-8 continuation byte (10xxxxxx). enc_strlen() counts the bytes for
 * which this is true. */
2222#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2237static inline uintptr_t
2238count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2243 d = (d>>6) | (~d>>7);
2244 d &= NONASCII_MASK >> 7;
2247#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2249 return rb_popcount_intptr(d);
2253# if SIZEOF_VOIDP == 8
2262enc_strlen(
const char *p,
const char *e, rb_encoding *enc,
int cr)
2268 long diff = (long)(e - p);
2269 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2274 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2275 const uintptr_t *s, *t;
2276 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2277 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2278 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2279 while (p < (
const char *)s) {
2280 if (is_utf8_lead_byte(*p))
len++;
2284 len += count_utf8_lead_bytes_with_word(s);
2287 p = (
const char *)s;
2290 if (is_utf8_lead_byte(*p))
len++;
2296 else if (rb_enc_asciicompat(enc)) {
2301 q = search_nonascii(p, e);
2307 p += rb_enc_fast_mbclen(p, e, enc);
2314 q = search_nonascii(p, e);
2320 p += rb_enc_mbclen(p, e, enc);
2327 for (c=0; p<e; c++) {
2328 p += rb_enc_mbclen(p, e, enc);
2343rb_enc_strlen_cr(
const char *p,
const char *e, rb_encoding *enc,
int *cr)
2351 long diff = (long)(e - p);
2352 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2354 else if (rb_enc_asciicompat(enc)) {
2358 q = search_nonascii(p, e);
2366 ret = rb_enc_precise_mbclen(p, e, enc);
2381 for (c=0; p<e; c++) {
2382 ret = rb_enc_precise_mbclen(p, e, enc);
2389 if (p + rb_enc_mbminlen(enc) <= e)
2390 p += rb_enc_mbminlen(enc);
2401str_strlen(
VALUE str, rb_encoding *enc)
2406 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2407 if (!enc) enc = STR_ENC_GET(str);
2413 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2418 return enc_strlen(p, e, enc, cr);
2425 return str_strlen(str, NULL);
2439 return LONG2NUM(str_strlen(str, NULL));
2451rb_str_bytesize(
VALUE str)
2470rb_str_empty(
VALUE str)
2491 char *ptr1, *ptr2, *ptr3;
2496 enc = rb_enc_check_str(str1, str2);
2499 termlen = rb_enc_mbminlen(enc);
2500 if (len1 > LONG_MAX - len2) {
2501 rb_raise(rb_eArgError,
"string size too big");
2503 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2505 memcpy(ptr3, ptr1, len1);
2506 memcpy(ptr3+len1, ptr2, len2);
2507 TERM_FILL(&ptr3[len1+len2], termlen);
2523 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2526 int enc1 = rb_enc_get_index(str1);
2527 int enc2 = rb_enc_get_index(str2);
2532 else if (enc2 < 0) {
2535 else if (enc1 != enc2) {
2538 else if (len1 > LONG_MAX - len2) {
2572 rb_enc_copy(str2, str);
2577 rb_raise(rb_eArgError,
"negative argument");
2580 if (STR_EMBEDDABLE_P(
len, 1)) {
2589 STR_SET_LEN(str2,
len);
2590 rb_enc_copy(str2, str);
2594 rb_raise(rb_eArgError,
"argument too big");
2598 termlen = TERM_LEN(str);
2604 while (n <=
len/2) {
2605 memcpy(ptr2 + n, ptr2, n);
2608 memcpy(ptr2 + n, ptr2,
len-n);
2610 STR_SET_LEN(str2,
len);
2611 TERM_FILL(&ptr2[
len], termlen);
2612 rb_enc_cr_str_copy_for_substr(str2, str);
2651rb_check_lockedtmp(
VALUE str)
2653 if (
FL_TEST(str, STR_TMPLOCK)) {
/* Flags that str_modifiable() tests with FL_ANY_RAW() before running its
 * chilled-string, rb_check_lockedtmp() and rb_check_frozen() checks. */
2660#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2662str_modifiable(
VALUE str)
2666 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2667 if (CHILLED_STRING_P(str)) {
2668 CHILLED_STRING_MUTATED(str);
2670 rb_check_lockedtmp(str);
2671 rb_check_frozen(str);
2676str_dependent_p(
VALUE str)
2678 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/* STR_UNMODIFIABLE_MASK plus STR_SHARED and STR_NOFREE. str_independent()
 * tests this mask with FL_ANY_RAW() and, when any bit is set, calls
 * str_modifiable() and then consults str_dependent_p(). */
2688#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2690str_independent(
VALUE str)
2694 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2695 str_modifiable(str);
2696 return !str_dependent_p(str);
2702str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2712 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2713 ptr =
RSTRING(str)->as.heap.ptr;
2717 STR_SET_LEN(str,
len);
2724 memcpy(ptr, oldptr,
len);
2726 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2729 STR_SET_NOEMBED(str);
2730 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2731 TERM_FILL(ptr +
len, termlen);
2732 RSTRING(str)->as.heap.ptr = ptr;
2733 STR_SET_LEN(str,
len);
2740 if (!str_independent(str))
2741 str_make_independent(str);
2750 int termlen = TERM_LEN(str);
2754 rb_raise(rb_eArgError,
"negative expanding string size");
2756 if (expand >= LONG_MAX -
len) {
2757 rb_raise(rb_eArgError,
"string size too big");
2760 if (!str_independent(str)) {
2761 str_make_independent_expand(str,
len, expand, termlen);
2763 else if (expand > 0) {
2764 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2771str_modify_keep_cr(
VALUE str)
2773 if (!str_independent(str))
2774 str_make_independent(str);
2781str_discard(
VALUE str)
2783 str_modifiable(str);
2784 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2785 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2786 RSTRING(str)->as.heap.ptr = 0;
2787 STR_SET_LEN(str, 0);
2794 int encindex = rb_enc_get_index(str);
2796 if (RB_UNLIKELY(encindex == -1)) {
2800 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2804 rb_encoding *enc = rb_enc_from_index(encindex);
2805 if (!rb_enc_asciicompat(enc)) {
2831zero_filled(
const char *s,
int n)
2833 for (; n > 0; --n) {
2840str_null_char(
const char *s,
long len,
const int minlen, rb_encoding *enc)
2842 const char *e = s +
len;
2844 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2845 if (zero_filled(s, minlen))
return s;
2851str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2856 if (str_dependent_p(str)) {
2857 if (!zero_filled(s +
len, termlen))
2858 str_make_independent_expand(str,
len, 0L, termlen);
2861 TERM_FILL(s +
len, termlen);
2868rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2870 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2875 rb_check_lockedtmp(str);
2876 str_make_independent_expand(str,
len, 0L, termlen);
2878 else if (str_dependent_p(str)) {
2879 if (termlen > oldtermlen)
2880 str_make_independent_expand(str,
len, 0L, termlen);
2883 if (!STR_EMBED_P(str)) {
2888 if (termlen > oldtermlen) {
2897str_null_check(
VALUE str,
int *w)
2901 rb_encoding *enc = rb_enc_get(str);
2902 const int minlen = rb_enc_mbminlen(enc);
2906 if (str_null_char(s,
len, minlen, enc)) {
2909 return str_fill_term(str, s,
len, minlen);
2912 if (!s || memchr(s, 0,
len)) {
2916 s = str_fill_term(str, s,
len, minlen);
2922rb_str_to_cstr(
VALUE str)
2925 return str_null_check(str, &w);
2933 char *s = str_null_check(str, &w);
2936 rb_raise(rb_eArgError,
"string contains null char");
2938 rb_raise(rb_eArgError,
"string contains null byte");
2944rb_str_fill_terminator(
VALUE str,
const int newminlen)
2948 return str_fill_term(str, s,
len, newminlen);
2954 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2980str_nth_len(
const char *p,
const char *e,
long *nthp, rb_encoding *enc)
2989 else if (rb_enc_asciicompat(enc)) {
2990 const char *p2, *e2;
2993 while (p < e && 0 < nth) {
3000 p2 = search_nonascii(p, e2);
3009 n = rb_enc_mbclen(p, e, enc);
3020 while (p < e && nth--) {
3021 p += rb_enc_mbclen(p, e, enc);
3030rb_enc_nth(
const char *p,
const char *e,
long nth, rb_encoding *enc)
3032 return str_nth_len(p, e, &nth, enc);
3036str_nth(
const char *p,
const char *e,
long nth, rb_encoding *enc,
int singlebyte)
3041 p = str_nth_len(p, e, &nth, enc);
3050str_offset(
const char *p,
const char *e,
long nth, rb_encoding *enc,
int singlebyte)
3052 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3053 if (!pp)
return e - p;
3061 STR_ENC_GET(str), single_byte_optimizable(str));
3066str_utf8_nth(
const char *p,
const char *e,
long *nthp)
3069 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
3070 const uintptr_t *s, *t;
3071 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3072 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3073 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
3074 while (p < (
const char *)s) {
3075 if (is_utf8_lead_byte(*p)) nth--;
3079 nth -= count_utf8_lead_bytes_with_word(s);
3081 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
3085 if (is_utf8_lead_byte(*p)) {
3086 if (nth == 0)
break;
3096str_utf8_offset(
const char *p,
const char *e,
long nth)
3098 const char *pp = str_utf8_nth(p, e, &nth);
3107 if (single_byte_optimizable(str) || pos < 0)
3111 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3116str_subseq(
VALUE str,
long beg,
long len)
3124 const int termlen = TERM_LEN(str);
3132 if (str_embed_capa(str2) >=
len + termlen) {
3133 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3134 STR_SET_EMBED(str2);
3136 TERM_FILL(ptr2+
len, termlen);
3138 STR_SET_LEN(str2,
len);
3142 str_replace_shared(str2, str);
3145 RSTRING(str2)->as.heap.ptr += beg;
3147 STR_SET_LEN(str2,
len);
3157 VALUE str2 = str_subseq(str, beg,
len);
3158 rb_enc_cr_str_copy_for_substr(str2, str);
3168 rb_encoding *enc = STR_ENC_GET(str);
3171 if (
len < 0)
return 0;
3172 if (beg < 0 && -beg < 0)
return 0;
3176 if (single_byte_optimizable(str)) {
3177 if (beg > blen)
return 0;
3180 if (beg < 0)
return 0;
3182 if (
len > blen - beg)
3184 if (
len < 0)
return 0;
3189 if (
len > -beg)
len = -beg;
3193 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3196 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3202 slen = str_strlen(str, enc);
3204 if (beg < 0)
return 0;
3206 if (
len == 0)
goto end;
3209 else if (beg > 0 && beg > blen) {
3213 if (beg > str_strlen(str, enc))
return 0;
3219 p = str_utf8_nth(s, e, &beg);
3220 if (beg > 0)
return 0;
3221 len = str_utf8_offset(p, e,
len);
3227 p = s + beg * char_sz;
3231 else if (
len * char_sz > e - p)
3236 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3237 if (beg > 0)
return 0;
3241 len = str_offset(p, e,
len, enc, 0);
3249static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3254 return str_substr(str, beg,
len, TRUE);
3264str_substr(
VALUE str,
long beg,
long len,
int empty)
3268 if (!p)
return Qnil;
3269 if (!
len && !empty)
return Qnil;
3273 VALUE str2 = str_subseq(str, beg,
len);
3274 rb_enc_cr_str_copy_for_substr(str2, str);
3282 if (CHILLED_STRING_P(str)) {
3305 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3348str_uminus(
VALUE str)
3353 return rb_fstring(str);
3357#define rb_str_dup_frozen rb_str_new_frozen
3362 rb_check_frozen(str);
3363 if (
FL_TEST(str, STR_TMPLOCK)) {
3366 FL_SET(str, STR_TMPLOCK);
3373 rb_check_frozen(str);
3374 if (!
FL_TEST(str, STR_TMPLOCK)) {
3394 const int termlen = TERM_LEN(str);
3396 str_modifiable(str);
3397 if (STR_SHARED_P(str)) {
3400 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3401 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3417 rb_encoding *enc = rb_enc_get(str);
3434 STR_SET_LEN(str,
len);
3442 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3445 int independent = str_independent(str);
3447 const int termlen = TERM_LEN(str);
3449 if (slen >
len || (termlen != 1 && slen <
len)) {
3455 if (STR_EMBED_P(str)) {
3456 if (
len == slen)
return str;
3457 if (str_embed_capa(str) >=
len + termlen) {
3458 STR_SET_LEN(str,
len);
3462 str_make_independent_expand(str, slen,
len - slen, termlen);
3464 else if (str_embed_capa(str) >=
len + termlen) {
3465 char *ptr = STR_HEAP_PTR(str);
3467 if (slen >
len) slen =
len;
3470 STR_SET_LEN(str,
len);
3471 if (independent) ruby_xfree(ptr);
3474 else if (!independent) {
3475 if (
len == slen)
return str;
3476 str_make_independent_expand(str, slen,
len - slen, termlen);
3480 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3481 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3484 else if (
len == slen)
return str;
3485 STR_SET_LEN(str,
len);
3492str_ensure_available_capa(
VALUE str,
long len)
3494 str_modify_keep_cr(str);
3496 const int termlen = TERM_LEN(str);
3499 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3500 rb_raise(rb_eArgError,
"string sizes too big");
3503 long total = olen +
len;
3504 long capa = str_capacity(str, termlen);
3507 if (total >= LONG_MAX / 2) {
3510 while (total >
capa) {
3513 RESIZE_CAPA_TERM(str,
capa, termlen);
3518str_buf_cat4(
VALUE str,
const char *ptr,
long len,
bool keep_cr)
3521 str_modify_keep_cr(str);
3526 if (
len == 0)
return 0;
3528 long total, olen,
off = -1;
3530 const int termlen = TERM_LEN(str);
3533 if (ptr >= sptr && ptr <= sptr + olen) {
3537 long capa = str_capacity(str, termlen);
3539 if (olen > LONG_MAX -
len) {
3540 rb_raise(rb_eArgError,
"string sizes too big");
3544 if (total >= LONG_MAX / 2) {
3547 while (total >
capa) {
3550 RESIZE_CAPA_TERM(str,
capa, termlen);
3556 memcpy(sptr + olen, ptr,
len);
3557 STR_SET_LEN(str, total);
3558 TERM_FILL(sptr + total, termlen);
3563#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3564#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3569 if (
len == 0)
return str;
3571 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3573 return str_buf_cat(str, ptr,
len);
3584rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3589 if (UNLIKELY(!str_independent(str))) {
3590 str_make_independent(str);
3593 long string_length = -1;
3594 const int null_terminator_length = 1;
3599 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3600 rb_raise(rb_eArgError,
"string sizes too big");
3603 long string_capacity = str_capacity(str, null_terminator_length);
3609 if (LIKELY(string_capacity >= string_length + 1)) {
3611 sptr[string_length] = byte;
3612 STR_SET_LEN(str, string_length + 1);
3613 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3617 str_buf_cat(str, (
char *)&
byte, 1);
3625 if (ISASCII(
byte)) {
3633 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3644rb_enc_cr_str_buf_cat(
VALUE str,
const char *ptr,
long len,
3645 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3650 rb_encoding *str_enc, *ptr_enc;
3654 if (str_encindex == ptr_encindex) {
3656 ptr_cr = coderange_scan(ptr,
len, rb_enc_from_index(ptr_encindex));
3660 str_enc = rb_enc_from_index(str_encindex);
3661 ptr_enc = rb_enc_from_index(ptr_encindex);
3662 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3668 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3674 ptr_cr = coderange_scan(ptr,
len, ptr_enc);
3683 *ptr_cr_ret = ptr_cr;
3685 if (str_encindex != ptr_encindex &&
3688 str_enc = rb_enc_from_index(str_encindex);
3689 ptr_enc = rb_enc_from_index(ptr_encindex);
3694 res_encindex = str_encindex;
3699 res_encindex = str_encindex;
3703 res_encindex = ptr_encindex;
3708 res_encindex = str_encindex;
3715 res_encindex = str_encindex;
3721 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3723 str_buf_cat(str, ptr,
len);
3729 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3736 return rb_enc_cr_str_buf_cat(str, ptr,
len,
3745 rb_encoding *enc = rb_enc_from_index(encindex);
3746 if (rb_enc_asciicompat(enc)) {
3747 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3753 unsigned int c = (
unsigned char)*ptr;
3754 int len = rb_enc_codelen(c, enc);
3755 rb_enc_mbcput(c, buf, enc);
3756 rb_enc_cr_str_buf_cat(str, buf,
len,
3769 if (str_enc_fastpath(str)) {
3806rb_str_concat_literals(
size_t num,
const VALUE *strary)
3810 unsigned long len = 1;
3817 str_enc_copy_direct(str, strary[0]);
3819 for (i = s; i < num; ++i) {
3820 const VALUE v = strary[i];
3824 if (encidx != ENCINDEX_US_ASCII) {
3826 rb_enc_set_index(str, encidx);
3839rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3841 str_modifiable(str);
3846 else if (argc > 1) {
3849 rb_enc_copy(arg_str, str);
3850 for (i = 0; i < argc; i++) {
3885rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3887 long needed_capacity = 0;
3891 for (
int index = 0; index < argc; index++) {
3892 VALUE obj = argv[index];
3905 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3912 str_ensure_available_capa(str, needed_capacity);
3915 for (
int index = 0; index < argc; index++) {
3916 VALUE obj = argv[index];
3921 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3922 char byte = (char)(
NUM2INT(obj) & 0xFF);
3931 memcpy(sptr, ptr,
len);
3936 rb_bug(
"append_as_bytes arguments should have been validated");
3940 STR_SET_LEN(str,
RSTRING_LEN(str) + needed_capacity);
3941 TERM_FILL(sptr, TERM_LEN(str));
3946 for (
int index = 0; index < argc; index++) {
3947 VALUE obj = argv[index];
3964 rb_bug(
"append_as_bytes arguments should have been validated");
4039 rb_encoding *enc = STR_ENC_GET(str1);
4043 if (rb_num_to_uint(str2, &code) == 0) {
4056 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4059 rb_str_buf_cat_byte(str1, (
unsigned char)code);
4067 switch (
len = rb_enc_codelen(code, enc)) {
4068 case ONIGERR_INVALID_CODE_POINT_VALUE:
4069 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4071 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4077 rb_enc_mbcput(code, buf, enc);
4078 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
4079 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4081 rb_str_resize(str1, pos+
len);
4095rb_ascii8bit_appendable_encoding_index(rb_encoding *enc,
unsigned int code)
4097 int encidx = rb_enc_to_index(enc);
4099 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4104 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4105 return ENCINDEX_ASCII_8BIT;
4127rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4129 str_modifiable(str);
4134 else if (argc > 1) {
4137 rb_enc_copy(arg_str, str);
4138 for (i = 0; i < argc; i++) {
4151 st_index_t precomputed_hash;
4152 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4154 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4155 return precomputed_hash;
4158 return str_do_hash(str);
4165 const char *ptr1, *ptr2;
4168 return (len1 != len2 ||
4170 memcmp(ptr1, ptr2, len1) != 0);
4182rb_str_hash_m(
VALUE str)
4188#define lesser(a,b) (((a)>(b))?(b):(a))
4200 if (idx1 == idx2)
return TRUE;
4205 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4209 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4219 const char *ptr1, *ptr2;
4222 if (str1 == str2)
return 0;
4225 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4234 if (len1 > len2)
return 1;
4237 if (retval > 0)
return 1;
4271 if (str1 == str2)
return Qtrue;
4278 return rb_str_eql_internal(str1, str2);
4292 if (str1 == str2)
return Qtrue;
4294 return rb_str_eql_internal(str1, str2);
4332 return rb_invcmp(str1, str2);
4374 return str_casecmp(str1, s);
4382 const char *p1, *p1end, *p2, *p2end;
4384 enc = rb_enc_compatible(str1, str2);
4391 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4392 while (p1 < p1end && p2 < p2end) {
4394 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4395 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4397 return INT2FIX(c1 < c2 ? -1 : 1);
4404 while (p1 < p1end && p2 < p2end) {
4405 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4406 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4408 if (0 <= c1 && 0 <= c2) {
4412 return INT2FIX(c1 < c2 ? -1 : 1);
4416 l1 = rb_enc_mbclen(p1, p1end, enc);
4417 l2 = rb_enc_mbclen(p2, p2end, enc);
4418 len = l1 < l2 ? l1 : l2;
4419 r = memcmp(p1, p2,
len);
4421 return INT2FIX(r < 0 ? -1 : 1);
4423 return INT2FIX(l1 < l2 ? -1 : 1);
4429 if (p1 == p1end && p2 == p2end)
return INT2FIX(0);
4430 if (p1 == p1end)
return INT2FIX(-1);
4463 return str_casecmp_p(str1, s);
4470 VALUE folded_str1, folded_str2;
4471 VALUE fold_opt = sym_fold;
4473 enc = rb_enc_compatible(str1, str2);
4478 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4479 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4481 return rb_str_eql(folded_str1, folded_str2);
4485strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4486 const char *sub_ptr,
long sub_len,
long offset, rb_encoding *enc)
4488 const char *search_start = str_ptr;
4489 long pos, search_len = str_len - offset;
4493 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4494 if (pos < 0)
return pos;
4496 if (t == search_start + pos)
break;
4497 search_len -= t - search_start;
4498 if (search_len <= 0)
return -1;
4499 offset += t - search_start;
4502 return pos + offset;
4506#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4507#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4510rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4512 const char *str_ptr, *str_ptr_end, *sub_ptr;
4513 long str_len, sub_len;
4516 enc = rb_enc_check(str, sub);
4517 if (is_broken_string(sub))
return -1;
4525 if (str_len < sub_len)
return -1;
4528 long str_len_char, sub_len_char;
4529 int single_byte = single_byte_optimizable(str);
4530 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4531 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4533 offset += str_len_char;
4534 if (offset < 0)
return -1;
4536 if (str_len_char - offset < sub_len_char)
return -1;
4537 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4540 if (sub_len == 0)
return offset;
4543 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4556rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4560 rb_encoding *enc = STR_ENC_GET(str);
4563 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4564 long slen = str_strlen(str, enc);
4566 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4579 enc, single_byte_optimizable(str));
4590 pos = rb_str_index(str, sub, pos);
4604str_ensure_byte_pos(
VALUE str,
long pos)
4606 if (!single_byte_optimizable(str)) {
4607 const char *s = RSTRING_PTR(str);
4609 const char *p = s + pos;
4610 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4612 "offset %ld does not land on character boundary", pos);
4685rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4691 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4692 long slen = RSTRING_LEN(str);
4694 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4705 str_ensure_byte_pos(str, pos);
4717 pos = rb_str_byteindex(str, sub, pos);
4718 if (pos >= 0)
return LONG2NUM(pos);
4725memrchr(
const char *search_str,
int chr,
long search_len)
4727 const char *ptr = search_str + search_len;
4728 while (ptr > search_str) {
4729 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4737str_rindex(
VALUE str,
VALUE sub,
const char *s, rb_encoding *enc)
4739 char *hit, *adjusted;
4741 long slen, searchlen;
4744 sbeg = RSTRING_PTR(str);
4745 slen = RSTRING_LEN(sub);
4746 if (slen == 0)
return s - sbeg;
4748 t = RSTRING_PTR(sub);
4750 searchlen = s - sbeg + 1;
4752 if (memcmp(s, t, slen) == 0) {
4757 hit = memrchr(sbeg, c, searchlen);
4760 if (hit != adjusted) {
4761 searchlen = adjusted - sbeg;
4764 if (memcmp(hit, t, slen) == 0)
4766 searchlen = adjusted - sbeg;
4767 }
while (searchlen > 0);
4781 enc = rb_enc_check(str, sub);
4782 if (is_broken_string(sub))
return -1;
4783 singlebyte = single_byte_optimizable(str);
4784 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4785 slen = str_strlen(sub, enc);
4788 if (
len < slen)
return -1;
4789 if (
len - pos < slen) pos =
len - slen;
4790 if (
len == 0)
return pos;
4792 sbeg = RSTRING_PTR(str);
4795 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4801 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4802 return str_rindex(str, sub, s, enc);
4814rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4818 rb_encoding *enc = STR_ENC_GET(str);
4819 long pos,
len = str_strlen(str, enc);
4821 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4823 if (pos < 0 && (pos +=
len) < 0) {
4829 if (pos >
len) pos =
len;
4837 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4838 enc, single_byte_optimizable(str));
4849 pos = rb_str_rindex(str, sub, pos);
4859rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4865 enc = rb_enc_check(str, sub);
4866 if (is_broken_string(sub))
return -1;
4867 len = RSTRING_LEN(str);
4868 slen = RSTRING_LEN(sub);
4871 if (
len < slen)
return -1;
4872 if (
len - pos < slen) pos =
len - slen;
4873 if (
len == 0)
return pos;
4875 sbeg = RSTRING_PTR(str);
4878 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4885 return str_rindex(str, sub, s, enc);
4975rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4979 long pos,
len = RSTRING_LEN(str);
4981 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4983 if (pos < 0 && (pos +=
len) < 0) {
4989 if (pos >
len) pos =
len;
4995 str_ensure_byte_pos(str, pos);
5007 pos = rb_str_byterindex(str, sub, pos);
5008 if (pos >= 0)
return LONG2NUM(pos);
5047 switch (OBJ_BUILTIN_TYPE(y)) {
5101rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
5108 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5139rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5143 re = get_pat(argv[0]);
5144 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5153static enum neighbor_char
5154enc_succ_char(
char *p,
long len, rb_encoding *enc)
5159 if (rb_enc_mbminlen(enc) > 1) {
5161 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5163 return NEIGHBOR_NOT_CHAR;
5165 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5167 if (!l)
return NEIGHBOR_NOT_CHAR;
5168 if (l !=
len)
return NEIGHBOR_WRAPPED;
5169 rb_enc_mbcput(c, p, enc);
5170 r = rb_enc_precise_mbclen(p, p +
len, enc);
5172 return NEIGHBOR_NOT_CHAR;
5174 return NEIGHBOR_FOUND;
5177 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5180 return NEIGHBOR_WRAPPED;
5181 ++((
unsigned char*)p)[i];
5182 l = rb_enc_precise_mbclen(p, p+
len, enc);
5186 return NEIGHBOR_FOUND;
5189 memset(p+l, 0xff,
len-l);
5195 for (len2 =
len-1; 0 < len2; len2--) {
5196 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5200 memset(p+len2+1, 0xff,
len-(len2+1));
5205static enum neighbor_char
5206enc_pred_char(
char *p,
long len, rb_encoding *enc)
5210 if (rb_enc_mbminlen(enc) > 1) {
5212 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5214 return NEIGHBOR_NOT_CHAR;
5216 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5217 if (!c)
return NEIGHBOR_NOT_CHAR;
5220 if (!l)
return NEIGHBOR_NOT_CHAR;
5221 if (l !=
len)
return NEIGHBOR_WRAPPED;
5222 rb_enc_mbcput(c, p, enc);
5223 r = rb_enc_precise_mbclen(p, p +
len, enc);
5225 return NEIGHBOR_NOT_CHAR;
5227 return NEIGHBOR_FOUND;
5230 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5233 return NEIGHBOR_WRAPPED;
5234 --((
unsigned char*)p)[i];
5235 l = rb_enc_precise_mbclen(p, p+
len, enc);
5239 return NEIGHBOR_FOUND;
5242 memset(p+l, 0,
len-l);
5248 for (len2 =
len-1; 0 < len2; len2--) {
5249 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5253 memset(p+len2+1, 0,
len-(len2+1));
5267static enum neighbor_char
5268enc_succ_alnum_char(
char *p,
long len, rb_encoding *enc,
char *carry)
5270 enum neighbor_char ret;
5274 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5278 const int max_gaps = 1;
5280 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5282 ctype = ONIGENC_CTYPE_DIGIT;
5284 ctype = ONIGENC_CTYPE_ALPHA;
5286 return NEIGHBOR_NOT_CHAR;
5289 for (
try = 0;
try <= max_gaps; ++
try) {
5290 ret = enc_succ_char(p,
len, enc);
5291 if (ret == NEIGHBOR_FOUND) {
5292 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5294 return NEIGHBOR_FOUND;
5301 ret = enc_pred_char(p,
len, enc);
5302 if (ret == NEIGHBOR_FOUND) {
5303 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5316 return NEIGHBOR_NOT_CHAR;
5319 if (ctype != ONIGENC_CTYPE_DIGIT) {
5321 return NEIGHBOR_WRAPPED;
5325 enc_succ_char(carry,
len, enc);
5326 return NEIGHBOR_WRAPPED;
5344 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5345 rb_enc_cr_str_copy_for_substr(str, orig);
5346 return str_succ(str);
5353 char *sbeg, *s, *e, *last_alnum = 0;
5354 int found_alnum = 0;
5356 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5357 long carry_pos = 0, carry_len = 1;
5358 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5360 slen = RSTRING_LEN(str);
5361 if (slen == 0)
return str;
5363 enc = STR_ENC_GET(str);
5364 sbeg = RSTRING_PTR(str);
5365 s = e = sbeg + slen;
5367 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5368 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5374 l = rb_enc_precise_mbclen(s, e, enc);
5375 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5376 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5377 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5379 case NEIGHBOR_NOT_CHAR:
5381 case NEIGHBOR_FOUND:
5383 case NEIGHBOR_WRAPPED:
5388 carry_pos = s - sbeg;
5393 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5394 enum neighbor_char neighbor;
5395 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5396 l = rb_enc_precise_mbclen(s, e, enc);
5397 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5398 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5400 neighbor = enc_succ_char(tmp, l, enc);
5402 case NEIGHBOR_FOUND:
5406 case NEIGHBOR_WRAPPED:
5409 case NEIGHBOR_NOT_CHAR:
5412 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5414 enc_succ_char(s, l, enc);
5416 if (!rb_enc_asciicompat(enc)) {
5417 MEMCPY(carry, s,
char, l);
5420 carry_pos = s - sbeg;
5424 RESIZE_CAPA(str, slen + carry_len);
5425 sbeg = RSTRING_PTR(str);
5426 s = sbeg + carry_pos;
5427 memmove(s + carry_len, s, slen - carry_pos);
5428 memmove(s, carry, carry_len);
5430 STR_SET_LEN(str, slen);
5431 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5447rb_str_succ_bang(
VALUE str)
5455all_digits_p(
const char *s,
long len)
5483 VALUE end, exclusive;
5487 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5493 VALUE current, after_end;
5500 enc = rb_enc_check(beg, end);
5501 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5503 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5504 char c = RSTRING_PTR(beg)[0];
5505 char e = RSTRING_PTR(end)[0];
5507 if (c > e || (excl && c == e))
return beg;
5509 VALUE str = rb_enc_str_new(&c, 1, enc);
5511 if ((*each)(str, arg))
break;
5512 if (!excl && c == e)
break;
5514 if (excl && c == e)
break;
5519 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5520 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5521 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5526 b = rb_str_to_inum(beg, 10, FALSE);
5527 e = rb_str_to_inum(end, 10, FALSE);
5534 if (excl && bi == ei)
break;
5535 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5540 ID op = excl ?
'<' : idLE;
5541 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5546 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5547 b = rb_funcallv(b, succ, 0, 0);
5554 if (n > 0 || (excl && n == 0))
return beg;
5556 after_end = rb_funcallv(end, succ, 0, 0);
5561 next = rb_funcallv(current, succ, 0, 0);
5562 if ((*each)(current, arg))
break;
5563 if (
NIL_P(next))
break;
5567 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5582 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5583 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5584 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5586 b = rb_str_to_inum(beg, 10, FALSE);
5592 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5600 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5601 b = rb_funcallv(b, succ, 0, 0);
5607 VALUE next = rb_funcallv(current, succ, 0, 0);
5608 if ((*each)(current, arg))
break;
5611 if (RSTRING_LEN(current) == 0)
5622 if (!
rb_equal(str, *argp))
return 0;
5636 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5637 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5638 rb_enc_asciicompat(STR_ENC_GET(val))) {
5639 const char *bp = RSTRING_PTR(beg);
5640 const char *ep = RSTRING_PTR(end);
5641 const char *vp = RSTRING_PTR(val);
5642 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5643 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5650 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5651 if (b <= v && v < e)
return Qtrue;
5652 return RBOOL(!
RTEST(exclusive) && v == e);
5659 all_digits_p(bp, RSTRING_LEN(beg)) &&
5660 all_digits_p(ep, RSTRING_LEN(end))) {
5665 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5667 return RBOOL(
NIL_P(val));
5690 return rb_str_subpat(str, indx,
INT2FIX(0));
5693 if (rb_str_index(str, indx, 0) != -1)
5699 long beg,
len = str_strlen(str, NULL);
5711 return str_substr(str, idx, 1, FALSE);
5728rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5732 return rb_str_subpat(str, argv[0], argv[1]);
5735 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5739 return rb_str_aref(str, argv[0]);
5745 char *ptr = RSTRING_PTR(str);
5746 long olen = RSTRING_LEN(str), nlen;
5748 str_modifiable(str);
5749 if (
len > olen)
len = olen;
5751 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5753 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5755 ptr =
RSTRING(str)->as.embed.ary;
5756 memmove(ptr, oldptr +
len, nlen);
5757 if (fl == STR_NOEMBED)
xfree(oldptr);
5760 if (!STR_SHARED_P(str)) {
5762 rb_enc_cr_str_exact_copy(shared, str);
5767 STR_SET_LEN(str, nlen);
5769 if (!SHARABLE_MIDDLE_SUBSTRING) {
5770 TERM_FILL(ptr + nlen, TERM_LEN(str));
5777rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5783 if (beg == 0 && vlen == 0) {
5788 str_modify_keep_cr(str);
5792 RESIZE_CAPA(str, slen + vlen -
len);
5793 sptr = RSTRING_PTR(str);
5802 memmove(sptr + beg + vlen,
5804 slen - (beg +
len));
5806 if (vlen < beg &&
len < 0) {
5810 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5813 STR_SET_LEN(str, slen);
5814 TERM_FILL(&sptr[slen], TERM_LEN(str));
5821 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5830 int singlebyte = single_byte_optimizable(str);
5836 enc = rb_enc_check(str, val);
5837 slen = str_strlen(str, enc);
5839 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5848 if (
len > slen - beg) {
5851 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5856 beg = p - RSTRING_PTR(str);
5858 rb_str_update_0(str, beg,
len, val);
5859 rb_enc_associate(str, enc);
5870 long start, end,
len;
5880 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5884 nth += regs->num_regs;
5894 enc = rb_enc_check_str(str, val);
5895 rb_str_update_0(str, start,
len, val);
5896 rb_enc_associate(str, enc);
5904 switch (
TYPE(indx)) {
5906 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5910 beg = rb_str_index(str, indx, 0);
5949rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
5953 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5961 return rb_str_aset(str, argv[0], argv[1]);
6013rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
6021 str_modify_keep_cr(str);
6029 if ((nth += regs->num_regs) <= 0)
return Qnil;
6031 else if (nth >= regs->num_regs)
return Qnil;
6033 len = END(nth) - beg;
6036 else if (argc == 2) {
6045 beg = p - RSTRING_PTR(str);
6049 beg = rb_str_index(str, indx, 0);
6050 if (beg == -1)
return Qnil;
6051 len = RSTRING_LEN(indx);
6063 beg = p - RSTRING_PTR(str);
6072 beg = p - RSTRING_PTR(str);
6076 rb_enc_cr_str_copy_for_substr(result, str);
6084 char *sptr = RSTRING_PTR(str);
6085 long slen = RSTRING_LEN(str);
6086 if (beg +
len > slen)
6090 slen - (beg +
len));
6092 STR_SET_LEN(str, slen);
6093 TERM_FILL(&sptr[slen], TERM_LEN(str));
6104 switch (OBJ_BUILTIN_TYPE(pat)) {
6123get_pat_quoted(
VALUE pat,
int check)
6127 switch (OBJ_BUILTIN_TYPE(pat)) {
6141 if (check && is_broken_string(pat)) {
6148rb_pat_search0(
VALUE pat,
VALUE str,
long pos,
int set_backref_str,
VALUE *match)
6151 pos = rb_str_byteindex(str, pat, pos);
6152 if (set_backref_str) {
6154 str = rb_str_new_frozen_String(str);
6155 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6157 *match = match_data;
6167 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6172rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6174 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6192rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6206 hash = rb_check_hash_type(argv[1]);
6212 pat = get_pat_quoted(argv[0], 1);
6214 str_modifiable(str);
6215 beg = rb_pat_search(pat, str, 0, 1);
6229 end0 = beg0 + RSTRING_LEN(pat);
6238 if (iter || !
NIL_P(hash)) {
6239 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6245 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6248 str_mod_check(str, p,
len);
6249 rb_check_frozen(str);
6255 enc = rb_enc_compatible(str, repl);
6257 rb_encoding *str_enc = STR_ENC_GET(str);
6258 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6262 rb_enc_inspect_name(str_enc),
6263 rb_enc_inspect_name(STR_ENC_GET(repl)));
6265 enc = STR_ENC_GET(repl);
6268 rb_enc_associate(str, enc);
6278 rlen = RSTRING_LEN(repl);
6279 len = RSTRING_LEN(str);
6281 RESIZE_CAPA(str,
len + rlen - plen);
6283 p = RSTRING_PTR(str);
6285 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6287 rp = RSTRING_PTR(repl);
6288 memmove(p + beg0, rp, rlen);
6290 STR_SET_LEN(str,
len);
6291 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6314 rb_str_sub_bang(argc, argv, str);
6319str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6322 long beg, beg0, end0;
6323 long offset, blen, slen,
len, last;
6324 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6326 int need_backref_str = -1;
6327 rb_encoding *str_enc;
6336 hash = rb_check_hash_type(argv[1]);
6340 else if (rb_hash_default_unredefined(hash) && !
FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6348 rb_error_arity(argc, 1, 2);
6351 pat = get_pat_quoted(argv[0], 1);
6352 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6355 if (bang)
return Qnil;
6360 blen = RSTRING_LEN(str) + 30;
6362 sp = RSTRING_PTR(str);
6363 slen = RSTRING_LEN(str);
6365 str_enc = STR_ENC_GET(str);
6366 rb_enc_associate(dest, str_enc);
6373 end0 = beg0 + RSTRING_LEN(pat);
6387 struct RString fake_str = {RBASIC_INIT};
6389 if (mode == FAST_MAP) {
6398 val = rb_hash_aref(hash, key);
6401 str_mod_check(str, sp, slen);
6406 else if (need_backref_str) {
6408 if (need_backref_str < 0) {
6409 need_backref_str = val != repl;
6416 len = beg0 - offset;
6433 offset = end0 +
len;
6439 if (mode != FAST_MAP && mode != STR) {
6442 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6450 rb_pat_search0(pat, str, last, 1, &match);
6452 str_shared_replace(str, dest);
/*
 * In-place variant of gsub: calls str_modify_keep_cr() on the receiver
 * first, then delegates to str_gsub() with its +bang+ argument set to 1 and
 * returns str_gsub()'s result.
 */
6477rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6479 str_modify_keep_cr(str);
6480 return str_gsub(argc, argv, str, 1);
6530 return str_gsub(argc, argv, str, 0);
6550 str_modifiable(str);
6551 if (str == str2)
return str;
6555 return str_replace(str, str2);
6572rb_str_clear(
VALUE str)
6576 STR_SET_LEN(str, 0);
6578 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6594rb_str_chr(
VALUE str)
6637 char *ptr, *head, *left = 0;
6641 if (pos < -
len ||
len <= pos)
6648 char byte = (char)(
NUM2INT(w) & 0xFF);
6650 if (!str_independent(str))
6651 str_make_independent(str);
6652 enc = STR_ENC_GET(str);
6655 if (!STR_EMBED_P(str)) {
6661 if (ISASCII(
byte))
goto end;
6662 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6670 width = rb_enc_precise_mbclen(left, head+
len, enc);
6672 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6688str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6692 if (beg > n ||
len < 0)
return Qnil;
6695 if (beg < 0)
return Qnil;
6700 if (!empty)
return Qnil;
6704 VALUE str2 = str_subseq(str, beg,
len);
6706 str_enc_copy_direct(str2, str);
6709 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6751 return str_byte_substr(str, beg,
len, TRUE);
6756 return str_byte_substr(str, idx, 1, FALSE);
6768rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6773 return str_byte_substr(str, beg,
len, TRUE);
6776 return str_byte_aref(str, argv[0]);
6780str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6785 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6794 if (*
len > slen - *beg) {
6798 str_ensure_byte_pos(str, *beg);
6799 str_ensure_byte_pos(str, end);
6813rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6815 long beg,
len, vbeg, vlen;
6820 if (!(argc == 2 || argc == 3 || argc == 5)) {
6821 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6825 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6826 rb_builtin_class_name(argv[0]));
6838 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6839 rb_builtin_class_name(argv[2]));
6859 str_check_beg_len(str, &beg, &
len);
6860 str_check_beg_len(val, &vbeg, &vlen);
6861 str_modify_keep_cr(str);
6864 rb_enc_associate(str, rb_enc_check(str, val));
6867 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
6889rb_str_reverse(
VALUE str)
6897 enc = STR_ENC_GET(str);
6904 if (single_byte_optimizable(str)) {
6911 int clen = rb_enc_fast_mbclen(s, e, enc);
6919 cr = rb_enc_asciicompat(enc) ?
6922 int clen = rb_enc_mbclen(s, e, enc);
6932 str_enc_copy_direct(rev, str);
6954rb_str_reverse_bang(
VALUE str)
6957 if (single_byte_optimizable(str)) {
6960 str_modify_keep_cr(str);
6970 str_shared_replace(str, rb_str_reverse(str));
6974 str_modify_keep_cr(str);
7003 i = rb_str_index(str, arg, 0);
7005 return RBOOL(i != -1);
7049 rb_raise(rb_eArgError,
"invalid radix %d", base);
7051 return rb_str_to_inum(str, base, FALSE);
7076rb_str_to_f(
VALUE str)
7093rb_str_to_s(
VALUE str)
7103str_cat_char(
VALUE str,
unsigned int c, rb_encoding *enc)
7105 char s[RUBY_MAX_CHAR_LEN];
7106 int n = rb_enc_codelen(c, enc);
7108 rb_enc_mbcput(c, s, enc);
7113#define CHAR_ESC_LEN 13
7116rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7118 char buf[CHAR_ESC_LEN + 1];
7126 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7128 else if (c < 0x10000) {
7129 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7132 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7137 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7140 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7143 l = (int)strlen(buf);
/*
 * Maps a control character to the C string holding its backslash escape
 * (for example '\n' yields the two characters backslash + 'n', and 0x7f
 * yields backslash + "c?").
 * NOTE(review): the result for characters not listed below is outside this
 * excerpt; rb_str_escape() tests the returned pointer before using it, so it
 * is presumably NULL -- confirm.
 */
7149ruby_escaped_char(
int c)
7152 case '\0':
return "\\0";
7153 case '\n':
return "\\n";
7154 case '\r':
return "\\r";
7155 case '\t':
return "\\t";
7156 case '\f':
return "\\f";
/* vertical tab */
7157 case '\013':
return "\\v";
/* backspace */
7158 case '\010':
return "\\b";
/* bell */
7159 case '\007':
return "\\a";
/* escape */
7160 case '\033':
return "\\e";
/* delete */
7161 case '\x7f':
return "\\c?";
7167rb_str_escape(
VALUE str)
7170 rb_encoding *enc = rb_enc_from_index(encidx);
7173 const char *prev = p;
7174 char buf[CHAR_ESC_LEN + 1];
7176 int unicode_p = rb_enc_unicode_p(enc);
7177 int asciicompat = rb_enc_asciicompat(enc);
7182 int n = rb_enc_precise_mbclen(p, pend, enc);
7184 if (p > prev) str_buf_cat(result, prev, p - prev);
7185 n = rb_enc_mbminlen(enc);
7187 n = (int)(pend - p);
7189 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7190 str_buf_cat(result, buf, strlen(buf));
7196 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7198 cc = ruby_escaped_char(c);
7200 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7201 str_buf_cat(result, cc, strlen(cc));
7204 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7207 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7208 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7212 if (p > prev) str_buf_cat(result, prev, p - prev);
7230 rb_encoding *enc = rb_enc_from_index(encidx);
7231 const char *p, *pend, *prev;
7232 char buf[CHAR_ESC_LEN + 1];
7235 int unicode_p = rb_enc_unicode_p(enc);
7236 int asciicompat = rb_enc_asciicompat(enc);
7240 rb_enc_associate(result, resenc);
7241 str_buf_cat2(result,
"\"");
7249 n = rb_enc_precise_mbclen(p, pend, enc);
7251 if (p > prev) str_buf_cat(result, prev, p - prev);
7252 n = rb_enc_mbminlen(enc);
7254 n = (int)(pend - p);
7256 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7257 str_buf_cat(result, buf, strlen(buf));
7263 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7265 if ((asciicompat || unicode_p) &&
7266 (c ==
'"'|| c ==
'\\' ||
7271 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7272 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7273 str_buf_cat2(result,
"\\");
7274 if (asciicompat || enc == resenc) {
7280 case '\n': cc =
'n';
break;
7281 case '\r': cc =
'r';
break;
7282 case '\t': cc =
't';
break;
7283 case '\f': cc =
'f';
break;
7284 case '\013': cc =
'v';
break;
7285 case '\010': cc =
'b';
break;
7286 case '\007': cc =
'a';
break;
7287 case 033: cc =
'e';
break;
7288 default: cc = 0;
break;
7291 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7294 str_buf_cat(result, buf, 2);
7307 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7311 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7312 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7317 if (p > prev) str_buf_cat(result, prev, p - prev);
7318 str_buf_cat2(result,
"\"");
/*
 * True when p has not reached e and the byte at p is '$', '@' or '{'.
 * The dump code applies it to the byte following a '#' to decide whether an
 * escaping backslash must be emitted before it.
 * Note: p is evaluated up to four times, so it must be free of side effects.
 */
7323#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7336 int encidx = rb_enc_get_index(str);
7337 rb_encoding *enc = rb_enc_from_index(encidx);
7339 const char *p, *pend;
7343 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7346 if (!rb_enc_asciicompat(enc)) {
7348 len += strlen(enc->name);
7354 unsigned char c = *p++;
7357 case '"':
case '\\':
7358 case '\n':
case '\r':
7359 case '\t':
case '\f':
7360 case '\013':
case '\010':
case '\007':
case '\033':
7365 clen = IS_EVSTR(p, pend) ? 2 : 1;
7373 if (u8 && c > 0x7F) {
7374 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7376 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7379 else if (cc <= 0xFFFFF)
7392 if (clen > LONG_MAX -
len) {
7404 unsigned char c = *p++;
7406 if (c ==
'"' || c ==
'\\') {
7410 else if (c ==
'#') {
7411 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7414 else if (c ==
'\n') {
7418 else if (c ==
'\r') {
7422 else if (c ==
'\t') {
7426 else if (c ==
'\f') {
7430 else if (c ==
'\013') {
7434 else if (c ==
'\010') {
7438 else if (c ==
'\007') {
7442 else if (c ==
'\033') {
7452 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7454 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7457 snprintf(q, qend-q,
"u%04X", cc);
7459 snprintf(q, qend-q,
"u{%X}", cc);
7464 snprintf(q, qend-q,
"x%02X", c);
7470 if (!rb_enc_asciicompat(enc)) {
7471 snprintf(q, qend-q, nonascii_suffix, enc->name);
7475 rb_enc_associate_index(result, encidx);
7481unescape_ascii(
unsigned int c)
7505undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end, rb_encoding **penc,
bool *utf8,
bool *binary)
7507 const char *s = *ss;
7511 unsigned char buf[6];
7512 static rb_encoding *enc_utf8 = NULL;
7529 *buf = unescape_ascii(*s);
7542 if (*penc != enc_utf8) {
7544 rb_enc_associate(undumped, enc_utf8);
7561 if (hexlen == 0 || hexlen > 6) {
7567 if (0xd800 <= c && c <= 0xdfff) {
7570 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7580 if (0xd800 <= c && c <= 0xdfff) {
7583 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7596 if (!ISASCII(*buf)) {
7613static VALUE rb_str_is_ascii_only_p(
VALUE str);
7625str_undump(
VALUE str)
7629 rb_encoding *enc = rb_enc_get(str);
7630 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7632 bool binary =
false;
7636 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7639 if (!str_null_check(str, &w)) {
7643 if (*s !=
'"')
goto invalid_format;
7661 static const char force_encoding_suffix[] =
".force_encoding(\"";
7662 static const char dup_suffix[] =
".dup";
7663 const char *encname;
7668 size =
sizeof(dup_suffix) - 1;
7669 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7671 size =
sizeof(force_encoding_suffix) - 1;
7672 if (s_end - s <= size)
goto invalid_format;
7673 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7677 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7681 s = memchr(s,
'"', s_end-s);
7683 if (!s)
goto invalid_format;
7684 if (s_end - s != 2)
goto invalid_format;
7685 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7687 encidx = rb_enc_find_index2(encname, (
long)size);
7691 rb_enc_associate_index(undumped, encidx);
7701 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7712 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7716rb_str_check_dummy_enc(rb_encoding *enc)
7718 if (rb_enc_dummy_p(enc)) {
/*
 * Fetches the encoding of +str+ with STR_ENC_GET() and passes it through
 * rb_str_check_dummy_enc() before use.
 * NOTE(review): the return statement is outside this excerpt; callers assign
 * the result to an encoding variable, so it presumably returns enc.
 */
7725str_true_enc(
VALUE str)
7727 rb_encoding *enc = STR_ENC_GET(str);
7728 rb_str_check_dummy_enc(enc);
/*
 * Translates the option symbols in argv into OnigCaseFoldType bits that are
 * merged into +flags+.
 *
 * Options handled in the visible code:
 *  - sym_turkic and sym_lithuanian, alone or combined in either order;
 *  - sym_ascii, which sets ONIGENC_CASE_ASCII_ONLY;
 *  - sym_fold, accepted only when +flags+ requests downcasing and not
 *    upcasing.
 * Raises ArgumentError ("too many options", "invalid second option",
 * "option :fold only allowed for downcasing", "invalid option") otherwise.
 * NOTE(review): the argc checks guarding each rb_raise and the final return
 * are outside this excerpt.
 */
7732static OnigCaseFoldType
7733check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7738 rb_raise(rb_eArgError,
"too many options");
7739 if (argv[0]==sym_turkic) {
7740 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7742 if (argv[1]==sym_lithuanian)
7743 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7745 rb_raise(rb_eArgError,
"invalid second option");
7748 else if (argv[0]==sym_lithuanian) {
7749 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7751 if (argv[1]==sym_turkic)
7752 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7754 rb_raise(rb_eArgError,
"invalid second option");
7758 rb_raise(rb_eArgError,
"too many options");
7759 else if (argv[0]==sym_ascii)
7760 flags |= ONIGENC_CASE_ASCII_ONLY;
7761 else if (argv[0]==sym_fold) {
/* :fold is only meaningful for a pure downcase request (DOWNCASE set,
 * UPCASE clear). */
7762 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
/* DOWNCASE is known to be set here, so the XOR clears it; FOLD is toggled
 * (presumably from clear to set -- confirm no caller pre-sets FOLD). */
7763 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7765 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7768 rb_raise(rb_eArgError,
"invalid option");
7773case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc,
VALUE str)
7781#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7782#ifndef CASEMAP_DEBUG
7783# define CASEMAP_DEBUG 0
7791 OnigUChar space[FLEX_ARY_LEN];
/*
 * Releases a singly linked chain of mapping buffers.  For every node the
 * successor is read first, then the node itself is released with
 * ruby_sized_xfree(), passing the node's capa field as the size argument.
 * NOTE(review): the declarations of current_buffer / previous_buffer are
 * outside this excerpt; current_buffer is presumably initialised from p.
 */
7795mapping_buffer_free(
void *p)
7799 while (current_buffer) {
7800 previous_buffer = current_buffer;
/* Advance before freeing so the next pointer is never read from freed
 * memory. */
7801 current_buffer = current_buffer->next;
7802 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7808 {0, mapping_buffer_free,},
7809 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7813rb_str_casemap(
VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7817 const OnigUChar *source_current, *source_end;
7818 int target_length = 0;
7819 VALUE buffer_anchor;
7822 size_t buffer_count = 0;
7823 int buffer_length_or_invalid;
7825 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7827 source_current = (OnigUChar*)RSTRING_PTR(source);
7832 while (source_current < source_end) {
7834 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7835 if (CASEMAP_DEBUG) {
7836 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
/* Append the freshly allocated buffer to the chain: store it through
 * pre_buffer (the slot that should point at it), then advance pre_buffer to
 * the new node's own next slot so the following allocation is linked after
 * it. */
7839 *pre_buffer = current_buffer;
7840 pre_buffer = &current_buffer->next;
/* The new node is the tail of the chain for now. */
7841 current_buffer->next = NULL;
/* Remember the capacity so case_map() can be given the end of the space and
 * the node can later be released with a sized free. */
7842 current_buffer->capa =
capa;
7843 buffer_length_or_invalid = enc->case_map(flags,
7844 &source_current, source_end,
7845 current_buffer->space,
7846 current_buffer->space+current_buffer->capa,
7848 if (buffer_length_or_invalid < 0) {
7849 current_buffer =
DATA_PTR(buffer_anchor);
7851 mapping_buffer_free(current_buffer);
7852 rb_raise(rb_eArgError,
"input string invalid");
7854 target_length += current_buffer->used = buffer_length_or_invalid;
7856 if (CASEMAP_DEBUG) {
7857 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7860 if (buffer_count==1) {
7861 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7864 char *target_current;
7867 target_current = RSTRING_PTR(target);
7868 current_buffer =
DATA_PTR(buffer_anchor);
7869 while (current_buffer) {
7870 memcpy(target_current, current_buffer->space, current_buffer->used);
7871 target_current += current_buffer->used;
7872 current_buffer = current_buffer->next;
7875 current_buffer =
DATA_PTR(buffer_anchor);
7877 mapping_buffer_free(current_buffer);
7882 str_enc_copy_direct(target, source);
7889rb_str_ascii_casemap(
VALUE source,
VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7891 const OnigUChar *source_current, *source_end;
7892 OnigUChar *target_current, *target_end;
7893 long old_length = RSTRING_LEN(source);
7894 int length_or_invalid;
7896 if (old_length == 0)
return Qnil;
7898 source_current = (OnigUChar*)RSTRING_PTR(source);
7900 if (source == target) {
7901 target_current = (OnigUChar*)source_current;
7902 target_end = (OnigUChar*)source_end;
7905 target_current = (OnigUChar*)RSTRING_PTR(target);
7909 length_or_invalid = onigenc_ascii_only_case_map(flags,
7910 &source_current, source_end,
7911 target_current, target_end, enc);
7912 if (length_or_invalid < 0)
7913 rb_raise(rb_eArgError,
"input string invalid");
7914 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7915 fprintf(stderr,
"problem with rb_str_ascii_casemap"
7916 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7917 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
7918 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7921 str_enc_copy(target, source);
/*
 * Upcases +str+ in place one byte at a time: every byte in 'a'..'z' is
 * replaced by the corresponding byte in 'A'..'Z'; other bytes are left
 * untouched by the visible code.
 * NOTE(review): the loop header, the update of +modified+ and the return
 * statement are outside this excerpt; rb_str_upcase_bang() treats a true
 * result as "something changed".
 */
7927upcase_single(
VALUE str)
7929 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
7930 bool modified =
false;
/* Read through unsigned char so bytes >= 0x80 do not become negative. */
7933 unsigned int c = *(
unsigned char*)s;
7935 if (
'a' <= c && c <=
'z') {
7936 *s =
'A' + (c -
'a');
/*
 * In-place upcase.  Starts from ONIGENC_CASE_UPCASE, merges the caller's
 * options with check_case_options(), then picks one of three strategies:
 *  1. case_option_single_p() holds: byte-wise upcase_single(), setting
 *     ONIGENC_CASE_MODIFIED when it reports a change;
 *  2. ONIGENC_CASE_ASCII_ONLY is set: rb_str_ascii_casemap() with source
 *     and target both +str+;
 *  3. otherwise: rb_str_casemap() builds a new string whose contents replace
 *     those of +str+ via str_shared_replace().
 * Returns +str+ when ONIGENC_CASE_MODIFIED ended up set in flags.
 * NOTE(review): the return value for the unmodified case is outside this
 * excerpt.
 */
7957rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
7960 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7962 flags = check_case_options(argc, argv, flags);
7963 str_modify_keep_cr(str);
7964 enc = str_true_enc(str);
7965 if (case_option_single_p(flags, enc, str)) {
7966 if (upcase_single(str))
7967 flags |= ONIGENC_CASE_MODIFIED;
7969 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7970 rb_str_ascii_casemap(str, str, &flags, enc);
7972 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7974 if (ONIGENC_CASE_MODIFIED&flags)
return str;
7987rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
7990 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7993 flags = check_case_options(argc, argv, flags);
7994 enc = str_true_enc(str);
7995 if (case_option_single_p(flags, enc, str)) {
7996 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7997 str_enc_copy_direct(ret, str);
8000 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8002 rb_str_ascii_casemap(str, ret, &flags, enc);
8005 ret = rb_str_casemap(str, &flags, enc);
/*
 * Downcases +str+ in place one byte at a time: every byte in 'A'..'Z' is
 * replaced by the corresponding byte in 'a'..'z'; other bytes are left
 * untouched by the visible code.
 * NOTE(review): the loop header, the update of +modified+ and the return
 * statement are outside this excerpt; rb_str_downcase_bang() treats a true
 * result as "something changed".
 */
8012downcase_single(
VALUE str)
8014 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8015 bool modified =
false;
/* Read through unsigned char so bytes >= 0x80 do not become negative. */
8018 unsigned int c = *(
unsigned char*)s;
8020 if (
'A' <= c && c <=
'Z') {
8021 *s =
'a' + (c -
'A');
8043rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8046 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8048 flags = check_case_options(argc, argv, flags);
8049 str_modify_keep_cr(str);
8050 enc = str_true_enc(str);
8051 if (case_option_single_p(flags, enc, str)) {
8052 if (downcase_single(str))
8053 flags |= ONIGENC_CASE_MODIFIED;
8055 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8056 rb_str_ascii_casemap(str, str, &flags, enc);
8058 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8060 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8074rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8077 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8080 flags = check_case_options(argc, argv, flags);
8081 enc = str_true_enc(str);
8082 if (case_option_single_p(flags, enc, str)) {
8083 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8084 str_enc_copy_direct(ret, str);
8085 downcase_single(ret);
8087 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8089 rb_str_ascii_casemap(str, ret, &flags, enc);
8092 ret = rb_str_casemap(str, &flags, enc);
8112rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8115 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8117 flags = check_case_options(argc, argv, flags);
8118 str_modify_keep_cr(str);
8119 enc = str_true_enc(str);
8120 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8121 if (flags&ONIGENC_CASE_ASCII_ONLY)
8122 rb_str_ascii_casemap(str, str, &flags, enc);
8124 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8126 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8140rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8143 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8146 flags = check_case_options(argc, argv, flags);
8147 enc = str_true_enc(str);
8148 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
8149 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8151 rb_str_ascii_casemap(str, ret, &flags, enc);
8154 ret = rb_str_casemap(str, &flags, enc);
8173rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8176 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8178 flags = check_case_options(argc, argv, flags);
8179 str_modify_keep_cr(str);
8180 enc = str_true_enc(str);
8181 if (flags&ONIGENC_CASE_ASCII_ONLY)
8182 rb_str_ascii_casemap(str, str, &flags, enc);
8184 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8186 if (ONIGENC_CASE_MODIFIED&flags)
return str;
8200rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8203 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8206 flags = check_case_options(argc, argv, flags);
8207 enc = str_true_enc(str);
8208 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8209 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8211 rb_str_ascii_casemap(str, ret, &flags, enc);
8214 ret = rb_str_casemap(str, &flags, enc);
8219typedef unsigned char *USTR;
8223 unsigned int now, max;
8228trnext(
struct tr *t, rb_encoding *enc)
8235 if (t->p == t->pend)
return -1;
8236 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8239 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8241 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8243 if (t->p < t->pend) {
8244 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8247 if (t->now < 0x80 && c < 0x80) {
8248 rb_raise(rb_eArgError,
8249 "invalid range \"%c-%c\" in string transliteration",
8253 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8257 else if (t->now < c) {
8266 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8267 if (t->now == t->max) {
8272 if (t->now < t->max) {
8288 const unsigned int errc = -1;
8289 unsigned int trans[256];
8290 rb_encoding *enc, *e1, *e2;
8291 struct tr trsrc, trrepl;
8293 unsigned int c, c0, last = 0;
8294 int modify = 0, i, l;
8295 unsigned char *s, *send;
8297 int singlebyte = single_byte_optimizable(str);
/*
 * Tracks the code range while characters are produced: if the enclosing
 * function's local +cr+ is still ENC_CODERANGE_7BIT and +c+ is not ASCII,
 * cr is changed to ENC_CODERANGE_VALID; otherwise nothing happens.
 * The (void) cast discards the value of the conditional expression.
 * Requires a variable named cr in the scope where the macro is expanded.
 */
8301#define CHECK_IF_ASCII(c) \
8302 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8303 (cr = ENC_CODERANGE_VALID) : 0)
8307 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8308 if (RSTRING_LEN(repl) == 0) {
8309 return rb_str_delete_bang(1, &src, str);
8313 e1 = rb_enc_check(str, src);
8314 e2 = rb_enc_check(str, repl);
8319 enc = rb_enc_check(src, repl);
8321 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8322 if (RSTRING_LEN(src) > 1 &&
8323 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8324 trsrc.p + l < trsrc.pend) {
8328 trrepl.p = RSTRING_PTR(repl);
8329 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8330 trsrc.gen = trrepl.gen = 0;
8331 trsrc.now = trrepl.now = 0;
8332 trsrc.max = trrepl.max = 0;
8335 for (i=0; i<256; i++) {
8338 while ((c = trnext(&trsrc, enc)) != errc) {
8347 while ((c = trnext(&trrepl, enc)) != errc)
8350 for (i=0; i<256; i++) {
8351 if (trans[i] != errc) {
8359 for (i=0; i<256; i++) {
8362 while ((c = trnext(&trsrc, enc)) != errc) {
8363 r = trnext(&trrepl, enc);
8364 if (r == errc) r = trrepl.now;
8367 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8378 str_modify_keep_cr(str);
8379 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8380 termlen = rb_enc_mbminlen(enc);
8383 long offset, max = RSTRING_LEN(str);
8384 unsigned int save = -1;
8385 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8390 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8393 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8396 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8398 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8407 if (cflag) c = last;
8410 else if (cflag) c = errc;
8416 if (c != (
unsigned int)-1) {
8422 tlen = rb_enc_codelen(c, enc);
8428 if (enc != e1) may_modify = 1;
8430 if ((offset = t - buf) + tlen > max) {
8431 size_t MAYBE_UNUSED(old) = max + termlen;
8432 max = offset + tlen + (send - s);
8433 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8436 rb_enc_mbcput(c, t, enc);
8437 if (may_modify && memcmp(s, t, tlen) != 0) {
8443 if (!STR_EMBED_P(str)) {
8444 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8446 TERM_FILL((
char *)t, termlen);
8447 RSTRING(str)->as.heap.ptr = (
char *)buf;
8448 STR_SET_LEN(str, t - buf);
8449 STR_SET_NOEMBED(str);
8450 RSTRING(str)->as.heap.aux.capa = max;
8454 c = (
unsigned char)*s;
8455 if (trans[c] != errc) {
8472 long offset, max = (long)((send - s) * 1.2);
8473 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8478 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8481 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8484 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8486 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8494 if (cflag) c = last;
8497 else if (cflag) c = errc;
8501 c = cflag ? last : errc;
8504 tlen = rb_enc_codelen(c, enc);
8509 if (enc != e1) may_modify = 1;
8511 if ((offset = t - buf) + tlen > max) {
8512 size_t MAYBE_UNUSED(old) = max + termlen;
8513 max = offset + tlen + (long)((send - s) * 1.2);
8514 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8518 rb_enc_mbcput(c, t, enc);
8519 if (may_modify && memcmp(s, t, tlen) != 0) {
8527 if (!STR_EMBED_P(str)) {
8528 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8530 TERM_FILL((
char *)t, termlen);
8531 RSTRING(str)->as.heap.ptr = (
char *)buf;
8532 STR_SET_LEN(str, t - buf);
8533 STR_SET_NOEMBED(str);
8534 RSTRING(str)->as.heap.aux.capa = max;
8540 rb_enc_associate(str, enc);
8562 return tr_trans(str, src, repl, 0);
8607 tr_trans(str, src, repl, 0);
/*
 * TR_TABLE_MAX is the number of distinct unsigned char values; indices
 * 0..TR_TABLE_MAX-1 of a tr table are addressed directly by code point.
 * TR_TABLE_SIZE adds one extra slot at index TR_TABLE_MAX: tr_setup_table()
 * stores its cflag there and tr_find() returns that slot as the fallback
 * answer for code points the table does not cover.
 */
8611#define TR_TABLE_MAX (UCHAR_MAX+1)
8612#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8614tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8615 VALUE *tablep,
VALUE *ctablep, rb_encoding *enc)
8617 const unsigned int errc = -1;
8618 char buf[TR_TABLE_MAX];
8621 VALUE table = 0, ptable = 0;
8622 int i, l, cflag = 0;
8624 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8625 tr.gen =
tr.now =
tr.max = 0;
8627 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8632 for (i=0; i<TR_TABLE_MAX; i++) {
8635 stable[TR_TABLE_MAX] = cflag;
8637 else if (stable[TR_TABLE_MAX] && !cflag) {
8638 stable[TR_TABLE_MAX] = 0;
8640 for (i=0; i<TR_TABLE_MAX; i++) {
8644 while ((c = trnext(&
tr, enc)) != errc) {
8645 if (c < TR_TABLE_MAX) {
8646 buf[(
unsigned char)c] = !cflag;
8651 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8663 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8664 rb_hash_aset(table, key,
Qtrue);
8668 for (i=0; i<TR_TABLE_MAX; i++) {
8669 stable[i] = stable[i] && buf[i];
8671 if (!table && !cflag) {
/*
 * Reports whether code point +c+ is selected by a table built with
 * tr_setup_table().
 *
 *  - c < TR_TABLE_MAX: answered directly from table[c].
 *  - otherwise: the +del+ and +nodel+ hashes are consulted with key v, and
 *    if neither branch decides, the extra slot table[TR_TABLE_MAX] supplies
 *    the answer.
 * +nodel+ may be 0 (it is tested before being looked up).
 * NOTE(review): the construction of v, any guard on +del+, and the values
 * returned inside the two hash branches are outside this excerpt.
 */
8678tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8680 if (c < TR_TABLE_MAX) {
8681 return table[c] != 0;
/* v is present in del and not present in nodel (or there is no nodel). */
8687 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8688 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
/* v is present in nodel. */
8692 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8695 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8710rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8712 char squeez[TR_TABLE_SIZE];
8713 rb_encoding *enc = 0;
8715 VALUE del = 0, nodel = 0;
8717 int i, ascompat, cr;
8719 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8721 for (i=0; i<argc; i++) {
8725 enc = rb_enc_check(str, s);
8726 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8729 str_modify_keep_cr(str);
8730 ascompat = rb_enc_asciicompat(enc);
8731 s = t = RSTRING_PTR(str);
8738 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8749 c = rb_enc_codepoint_len(s, send, &clen, enc);
8751 if (tr_find(c, squeez, del, nodel)) {
8755 if (t != s) rb_enc_mbcput(c, t, enc);
8762 TERM_FILL(t, TERM_LEN(str));
8763 STR_SET_LEN(str, t - RSTRING_PTR(str));
8766 if (modify)
return str;
8780rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8783 rb_str_delete_bang(argc, argv, str);
8801rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8803 char squeez[TR_TABLE_SIZE];
8804 rb_encoding *enc = 0;
8805 VALUE del = 0, nodel = 0;
8806 unsigned char *s, *send, *t;
8808 int ascompat, singlebyte = single_byte_optimizable(str);
8812 enc = STR_ENC_GET(str);
8815 for (i=0; i<argc; i++) {
8819 enc = rb_enc_check(str, s);
8820 if (singlebyte && !single_byte_optimizable(s))
8822 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8826 str_modify_keep_cr(str);
8827 s = t = (
unsigned char *)RSTRING_PTR(str);
8828 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8831 ascompat = rb_enc_asciicompat(enc);
8835 unsigned int c = *s++;
8836 if (c != save || (argc > 0 && !squeez[c])) {
8846 if (ascompat && (c = *s) < 0x80) {
8847 if (c != save || (argc > 0 && !squeez[c])) {
8853 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8855 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8856 if (t != s) rb_enc_mbcput(c, t, enc);
8865 TERM_FILL((
char *)t, TERM_LEN(str));
8866 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8867 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
8871 if (modify)
return str;
8885rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
8888 rb_str_squeeze_bang(argc, argv, str);
8908 return tr_trans(str, src, repl, 1);
8936 tr_trans(str, src, repl, 1);
8949rb_str_count(
int argc,
VALUE *argv,
VALUE str)
8951 char table[TR_TABLE_SIZE];
8952 rb_encoding *enc = 0;
8953 VALUE del = 0, nodel = 0, tstr;
8963 enc = rb_enc_check(str, tstr);
8966 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8967 (ptstr = RSTRING_PTR(tstr),
8968 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
8969 !is_broken_string(str)) {
8971 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8973 s = RSTRING_PTR(str);
8974 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
8977 if (*(
unsigned char*)s++ == c) n++;
8983 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8984 for (i=1; i<argc; i++) {
8987 enc = rb_enc_check(str, tstr);
8988 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8991 s = RSTRING_PTR(str);
8992 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
8994 ascompat = rb_enc_asciicompat(enc);
8998 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9006 c = rb_enc_codepoint_len(s, send, &clen, enc);
9007 if (tr_find(c, table, del, nodel)) {
9018rb_fs_check(
VALUE val)
9022 if (
NIL_P(val))
return 0;
/*
 * Byte-indexed lookup table: entry is 1 for the six bytes 0x09..0x0d
 * (tab, newline, vertical tab, form feed, carriage return) and 0x20 (space),
 * and 0 for every other byte value, including all bytes >= 0x80.
 */
9027static const char isspacetable[256] = {
9028 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9029 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9030 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9031 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9032 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9033 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9034 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9035 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9036 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9037 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9038 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9039 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9040 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9041 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9042 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9043 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/* Table lookup for one byte; the cast to unsigned char keeps the index in
 * 0..255 even when c is a negative char. */
9046#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9049split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
9051 if (empty_count >= 0 &&
len == 0) {
9052 return empty_count + 1;
9054 if (empty_count > 0) {
9059 }
while (--empty_count > 0);
9063 rb_yield(str_new_empty_String(str));
9064 }
while (--empty_count > 0);
9078 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9082literal_split_pattern(
VALUE spat, split_type_t default_type)
9084 rb_encoding *enc = STR_ENC_GET(spat);
9090 return SPLIT_TYPE_CHARS;
9092 else if (rb_enc_asciicompat(enc)) {
9093 if (
len == 1 && ptr[0] ==
' ') {
9094 return SPLIT_TYPE_AWK;
9099 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9100 return SPLIT_TYPE_AWK;
9103 return default_type;
9116rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9121 split_type_t split_type;
9122 long beg, end, i = 0, empty_count = -1;
9127 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9129 if (lim <= 0) limit =
Qnil;
9130 else if (lim == 1) {
9131 if (RSTRING_LEN(str) == 0)
9142 if (
NIL_P(limit) && !lim) empty_count = 0;
9144 enc = STR_ENC_GET(str);
9145 split_type = SPLIT_TYPE_REGEXP;
9147 spat = get_pat_quoted(spat, 0);
9150 split_type = SPLIT_TYPE_AWK;
9152 else if (!(spat = rb_fs_check(spat))) {
9153 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9158 if (split_type != SPLIT_TYPE_AWK) {
9163 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9164 if (split_type == SPLIT_TYPE_AWK) {
9166 split_type = SPLIT_TYPE_STRING;
9171 mustnot_broken(spat);
9172 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/*
 * Emits one split field covering +len+ bytes of str starting at +beg+:
 * split_string() is called and its return value becomes the new
 * empty_count, then str_mod_check() is run against the start pointer and
 * length captured before splitting began.
 * NOTE(review): str_mod_check() presumably raises if str was modified in
 * the meantime (e.g. by a block invoked from split_string) -- confirm.
 * Expands to a comma expression and relies on result, str, empty_count,
 * str_start and str_len being in scope at the expansion site.
 */
9180#define SPLIT_STR(beg, len) ( \
9181 empty_count = split_string(result, str, beg, len, empty_count), \
9182 str_mod_check(str, str_start, str_len))
9185 char *ptr = RSTRING_PTR(str);
9186 char *
const str_start = ptr;
9187 const long str_len = RSTRING_LEN(str);
9188 char *
const eptr = str_start + str_len;
9189 if (split_type == SPLIT_TYPE_AWK) {
9196 if (is_ascii_string(str)) {
9197 while (ptr < eptr) {
9198 c = (
unsigned char)*ptr++;
9200 if (ascii_isspace(c)) {
9206 if (!
NIL_P(limit) && lim <= i)
break;
9209 else if (ascii_isspace(c)) {
9210 SPLIT_STR(beg, end-beg);
9213 if (!
NIL_P(limit)) ++i;
9221 while (ptr < eptr) {
9224 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9233 if (!
NIL_P(limit) && lim <= i)
break;
9237 SPLIT_STR(beg, end-beg);
9240 if (!
NIL_P(limit)) ++i;
9248 else if (split_type == SPLIT_TYPE_STRING) {
9249 char *substr_start = ptr;
9250 char *sptr = RSTRING_PTR(spat);
9251 long slen = RSTRING_LEN(spat);
9254 mustnot_broken(str);
9255 enc = rb_enc_check(str, spat);
9256 while (ptr < eptr &&
9257 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9260 if (t != ptr + end) {
9264 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9265 str_mod_check(spat, sptr, slen);
9268 if (!
NIL_P(limit) && lim <= ++i)
break;
9270 beg = ptr - str_start;
9272 else if (split_type == SPLIT_TYPE_CHARS) {
9276 mustnot_broken(str);
9277 enc = rb_enc_get(str);
9278 while (ptr < eptr &&
9279 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9280 SPLIT_STR(ptr - str_start, n);
9282 if (!
NIL_P(limit) && lim <= ++i)
break;
9284 beg = ptr - str_start;
9288 long len = RSTRING_LEN(str);
9296 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
9301 if (start == end && BEG(0) == END(0)) {
9306 else if (last_null == 1) {
9307 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9314 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9320 SPLIT_STR(beg, end-beg);
9321 beg = start = END(0);
9325 for (idx=1; idx < regs->num_regs; idx++) {
9326 if (BEG(idx) == -1)
continue;
9327 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9329 if (!
NIL_P(limit) && lim <= ++i)
break;
9331 if (match) rb_match_unbusy(match);
9333 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9334 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9337 return result ? result : str;
9347 return rb_str_split_m(1, &sep, str);
9350#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9365#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9368chomp_newline(
const char *p,
const char *e, rb_encoding *enc)
9370 const char *prev = rb_enc_prev_char(p, e, e, enc);
9373 prev = rb_enc_prev_char(p, e, e, enc);
9374 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9386 RSTRING_LEN(rs) != 1 ||
9387 RSTRING_PTR(rs)[0] !=
'\n')) {
9393#define rb_rs get_rs()
9400 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9401 long pos,
len, rslen;
9407 static ID keywords[1];
9412 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9416 if (!ENUM_ELEM(ary, str)) {
9424 if (!RSTRING_LEN(str))
goto end;
9426 ptr = subptr = RSTRING_PTR(str);
9428 len = RSTRING_LEN(str);
9430 rslen = RSTRING_LEN(rs);
9433 enc = rb_enc_get(str);
9435 enc = rb_enc_check(str, rs);
9440 const char *eol = NULL;
9442 while (subend < pend) {
9443 long chomp_rslen = 0;
9445 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9447 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9449 if (eol == subend)
break;
9453 chomp_rslen = -rslen;
9457 if (!subptr) subptr = subend;
9461 }
while (subend < pend);
9463 if (rslen == 0) chomp_rslen = 0;
9465 subend - subptr + (chomp ? chomp_rslen : rslen));
9466 if (ENUM_ELEM(ary, line)) {
9467 str_mod_check(str, ptr,
len);
9469 subptr = eol = NULL;
9474 rsptr = RSTRING_PTR(rs);
9475 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9484 rsptr = RSTRING_PTR(rs);
9485 rslen = RSTRING_LEN(rs);
9488 while (subptr < pend) {
9489 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9493 if (hit != adjusted) {
9497 subend = hit += rslen;
9500 subend = chomp_newline(subptr, subend, enc);
9507 if (ENUM_ELEM(ary, line)) {
9508 str_mod_check(str, ptr,
len);
9513 if (subptr != pend) {
9516 pend = chomp_newline(subptr, pend, enc);
9518 else if (pend - subptr >= rslen &&
9519 memcmp(pend - rslen, rsptr, rslen) == 0) {
9524 ENUM_ELEM(ary, line);
9545rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9548 return rb_str_enumerate_lines(argc, argv, str, 0);
9603rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9605 VALUE ary = WANTARRAY(
"lines", 0);
9606 return rb_str_enumerate_lines(argc, argv, str, ary);
9620 for (i=0; i<RSTRING_LEN(str); i++) {
9621 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
9639rb_str_each_byte(
VALUE str)
9642 return rb_str_enumerate_bytes(str, 0);
9654rb_str_bytes(
VALUE str)
9656 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9657 return rb_str_enumerate_bytes(str, ary);
9675 ptr = RSTRING_PTR(str);
9676 len = RSTRING_LEN(str);
9677 enc = rb_enc_get(str);
9680 for (i = 0; i <
len; i += n) {
9681 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9686 for (i = 0; i <
len; i += n) {
9687 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
9708rb_str_each_char(
VALUE str)
9711 return rb_str_enumerate_chars(str, 0);
9723rb_str_chars(
VALUE str)
9726 return rb_str_enumerate_chars(str, ary);
9730rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9735 const char *ptr, *end;
9738 if (single_byte_optimizable(str))
9739 return rb_str_enumerate_bytes(str, ary);
9742 ptr = RSTRING_PTR(str);
9744 enc = STR_ENC_GET(str);
9747 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9768rb_str_each_codepoint(
VALUE str)
9771 return rb_str_enumerate_codepoints(str, 0);
9783rb_str_codepoints(
VALUE str)
9786 return rb_str_enumerate_codepoints(str, ary);
9790get_reg_grapheme_cluster(rb_encoding *enc)
9792 int encidx = rb_enc_to_index(enc);
9794 const OnigUChar source_ascii[] =
"\\X";
9795 const OnigUChar *source = source_ascii;
9796 size_t source_len =
sizeof(source_ascii) - 1;
9799#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9800#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9801#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9802#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9803#define CASE_UTF(e) \
9804 case ENCINDEX_UTF_##e: { \
9805 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9806 source = source_UTF_##e; \
9807 source_len = sizeof(source_UTF_##e); \
9810 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9818 regex_t *reg_grapheme_cluster;
9820 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9821 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9823 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9824 onig_error_code_to_str(message, r, &einfo);
9825 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9828 return reg_grapheme_cluster;
9832get_cached_reg_grapheme_cluster(rb_encoding *enc)
9834 int encidx = rb_enc_to_index(enc);
9835 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9838 if (!reg_grapheme_cluster_utf8) {
9839 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9842 return reg_grapheme_cluster_utf8;
9851 size_t grapheme_cluster_count = 0;
9852 rb_encoding *enc = get_encoding(str);
9853 const char *ptr, *end;
9855 if (!rb_enc_unicode_p(enc)) {
9859 bool cached_reg_grapheme_cluster =
true;
9860 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9861 if (!reg_grapheme_cluster) {
9862 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9863 cached_reg_grapheme_cluster =
false;
9866 ptr = RSTRING_PTR(str);
9870 OnigPosition
len = onig_match(reg_grapheme_cluster,
9871 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9872 (
const OnigUChar *)ptr, NULL, 0);
9873 if (
len <= 0)
break;
9874 grapheme_cluster_count++;
9878 if (!cached_reg_grapheme_cluster) {
9879 onig_free(reg_grapheme_cluster);
9882 return SIZET2NUM(grapheme_cluster_count);
9886rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9889 rb_encoding *enc = get_encoding(str);
9890 const char *ptr0, *ptr, *end;
9892 if (!rb_enc_unicode_p(enc)) {
9893 return rb_str_enumerate_chars(str, ary);
9898 bool cached_reg_grapheme_cluster =
true;
9899 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9900 if (!reg_grapheme_cluster) {
9901 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9902 cached_reg_grapheme_cluster =
false;
9905 ptr0 = ptr = RSTRING_PTR(str);
9909 OnigPosition
len = onig_match(reg_grapheme_cluster,
9910 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9911 (
const OnigUChar *)ptr, NULL, 0);
9912 if (
len <= 0)
break;
9917 if (!cached_reg_grapheme_cluster) {
9918 onig_free(reg_grapheme_cluster);
9938rb_str_each_grapheme_cluster(
VALUE str)
9941 return rb_str_enumerate_grapheme_clusters(str, 0);
9953rb_str_grapheme_clusters(
VALUE str)
9956 return rb_str_enumerate_grapheme_clusters(str, ary);
9960chopped_length(
VALUE str)
9962 rb_encoding *enc = STR_ENC_GET(str);
9963 const char *p, *p2, *beg, *end;
9965 beg = RSTRING_PTR(str);
9966 end = beg + RSTRING_LEN(str);
9967 if (beg >= end)
return 0;
9968 p = rb_enc_prev_char(beg, end, end, enc);
9970 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
9971 p2 = rb_enc_prev_char(beg, p, end, enc);
9972 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
9990rb_str_chop_bang(
VALUE str)
9992 str_modify_keep_cr(str);
9993 if (RSTRING_LEN(str) > 0) {
9995 len = chopped_length(str);
9996 STR_SET_LEN(str,
len);
9997 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10016rb_str_chop(
VALUE str)
10022smart_chomp(
VALUE str,
const char *e,
const char *p)
10024 rb_encoding *enc = rb_enc_get(str);
10025 if (rb_enc_mbminlen(enc) > 1) {
10030 pp = e - rb_enc_mbminlen(enc);
10033 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10041 if (--e > p && *(e-1) ==
'\r') {
10058 char *pp, *e, *rsptr;
10060 char *
const p = RSTRING_PTR(str);
10061 long len = RSTRING_LEN(str);
10063 if (
len == 0)
return 0;
10066 return smart_chomp(str, e, p);
10069 enc = rb_enc_get(str);
10072 if (rb_enc_mbminlen(enc) > 1) {
10077 pp -= rb_enc_mbminlen(enc);
10080 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10087 while (e > p && *(e-1) ==
'\n') {
10089 if (e > p && *(e-1) ==
'\r')
10095 if (rslen >
len)
return len;
10097 enc = rb_enc_get(rs);
10098 newline = rsptr[rslen-1];
10099 if (rslen == rb_enc_mbminlen(enc)) {
10101 if (newline ==
'\n')
10102 return smart_chomp(str, e, p);
10106 return smart_chomp(str, e, p);
10110 enc = rb_enc_check(str, rs);
10111 if (is_broken_string(rs)) {
10115 if (p[
len-1] == newline &&
10117 memcmp(rsptr, pp, rslen) == 0)) {
10118 if (at_char_boundary(p, pp, e, enc))
10119 return len - rslen;
10131chomp_rs(
int argc,
const VALUE *argv)
10135 VALUE rs = argv[0];
10147 long olen = RSTRING_LEN(str);
10148 long len = chompped_length(str, rs);
10149 if (
len >= olen)
return Qnil;
10150 str_modify_keep_cr(str);
10151 STR_SET_LEN(str,
len);
10152 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10172rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10175 str_modifiable(str);
10176 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10177 rs = chomp_rs(argc, argv);
10179 return rb_str_chomp_string(str, rs);
10192rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10194 VALUE rs = chomp_rs(argc, argv);
10200tr_setup_table_multi(
char table[TR_TABLE_SIZE],
VALUE *tablep,
VALUE *ctablep,
10201 VALUE str,
int num_selectors,
VALUE *selectors)
10205 for (i=0; i<num_selectors; i++) {
10206 VALUE selector = selectors[i];
10210 enc = rb_enc_check(str, selector);
10211 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10216lstrip_offset(
VALUE str,
const char *s,
const char *e, rb_encoding *enc)
10218 const char *
const start = s;
10220 if (!s || s >= e)
return 0;
10223 if (single_byte_optimizable(str)) {
10224 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10229 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10239lstrip_offset_table(
VALUE str,
const char *s,
const char *e, rb_encoding *enc,
10240 char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
10242 const char *
const start = s;
10244 if (!s || s >= e)
return 0;
10249 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10251 if (!tr_find(cc, table, del, nodel))
break;
10270rb_str_lstrip_bang(
int argc,
VALUE *argv,
VALUE str)
10274 long olen, loffset;
10276 str_modify_keep_cr(str);
10277 enc = STR_ENC_GET(str);
10280 char table[TR_TABLE_SIZE];
10281 VALUE del = 0, nodel = 0;
10283 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10284 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10287 loffset = lstrip_offset(str, start, start+olen, enc);
10291 long len = olen-loffset;
10292 s = start + loffset;
10293 memmove(start, s,
len);
10294 STR_SET_LEN(str,
len);
10295 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10330rb_str_lstrip(
int argc,
VALUE *argv,
VALUE str)
10337 char table[TR_TABLE_SIZE];
10338 VALUE del = 0, nodel = 0;
10340 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10341 loffset = lstrip_offset_table(str, start, start+
len, STR_ENC_GET(str), table, del, nodel);
10344 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10346 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10351rstrip_offset(
VALUE str,
const char *s,
const char *e, rb_encoding *enc)
10355 rb_str_check_dummy_enc(enc);
10359 if (!s || s >= e)
return 0;
10363 if (single_byte_optimizable(str)) {
10365 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10370 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10380rstrip_offset_table(
VALUE str,
const char *s,
const char *e, rb_encoding *enc,
10381 char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
10386 rb_str_check_dummy_enc(enc);
10390 if (!s || s >= e)
return 0;
10394 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10396 if (!tr_find(c, table, del, nodel))
break;
10416rb_str_rstrip_bang(
int argc,
VALUE *argv,
VALUE str)
10420 long olen, roffset;
10422 str_modify_keep_cr(str);
10423 enc = STR_ENC_GET(str);
10426 char table[TR_TABLE_SIZE];
10427 VALUE del = 0, nodel = 0;
10429 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10430 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10433 roffset = rstrip_offset(str, start, start+olen, enc);
10436 long len = olen - roffset;
10438 STR_SET_LEN(str,
len);
10439 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10473rb_str_rstrip(
int argc,
VALUE *argv,
VALUE str)
10477 long olen, roffset;
10479 enc = STR_ENC_GET(str);
10482 char table[TR_TABLE_SIZE];
10483 VALUE del = 0, nodel = 0;
10485 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10486 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10489 roffset = rstrip_offset(str, start, start+olen, enc);
10491 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
10509rb_str_strip_bang(
int argc,
VALUE *argv,
VALUE str)
10512 long olen, loffset, roffset;
10515 str_modify_keep_cr(str);
10516 enc = STR_ENC_GET(str);
10520 char table[TR_TABLE_SIZE];
10521 VALUE del = 0, nodel = 0;
10523 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10524 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10525 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10528 loffset = lstrip_offset(str, start, start+olen, enc);
10529 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10532 if (loffset > 0 || roffset > 0) {
10533 long len = olen-roffset;
10536 memmove(start, start + loffset,
len);
10538 STR_SET_LEN(str,
len);
10539 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10574rb_str_strip(
int argc,
VALUE *argv,
VALUE str)
10577 long olen, loffset, roffset;
10578 rb_encoding *enc = STR_ENC_GET(str);
10583 char table[TR_TABLE_SIZE];
10584 VALUE del = 0, nodel = 0;
10586 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10587 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10588 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10591 loffset = lstrip_offset(str, start, start+olen, enc);
10592 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10595 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10600scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10603 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10609 end = pos + RSTRING_LEN(pat);
10619 rb_encoding *enc = STR_ENC_GET(str);
10623 if (RSTRING_LEN(str) > end)
10624 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10633 if (!regs || regs->num_regs == 1) {
10639 for (
int i = 1; i < regs->num_regs; i++) {
10670 long last = -1, prev = 0;
10671 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10673 pat = get_pat_quoted(pat, 1);
10674 mustnot_broken(str);
10678 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10683 if (last >= 0) rb_pat_search(pat, str, last, 1);
10688 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10692 str_mod_check(str, p,
len);
10694 if (last >= 0) rb_pat_search(pat, str, last, 1);
10746rb_str_hex(
VALUE str)
10748 return rb_str_to_inum(str, 16, FALSE);
10832rb_str_oct(
VALUE str)
10834 return rb_str_to_inum(str, -8, FALSE);
10837#ifndef HAVE_CRYPT_R
10842 rb_nativethread_lock_t lock;
10843} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10912# define CRYPT_END() ALLOCV_END(databuf)
10915 extern char *crypt(
const char *,
const char *);
10916# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10919 const char *s, *saltp;
10922 char salt_8bit_clean[3];
10926 mustnot_wchar(str);
10927 mustnot_wchar(salt);
10929 saltp = RSTRING_PTR(salt);
10930 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10931 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10935 if (!ISASCII((
unsigned char)saltp[0]) || !ISASCII((
unsigned char)saltp[1])) {
10936 salt_8bit_clean[0] = saltp[0] & 0x7f;
10937 salt_8bit_clean[1] = saltp[1] & 0x7f;
10938 salt_8bit_clean[2] =
'\0';
10939 saltp = salt_8bit_clean;
10944# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10945 data->initialized = 0;
10947 res = crypt_r(s, saltp, data);
10950 res = crypt(s, saltp);
10965 size_t res_size = strlen(res)+1;
10966 tmp_buf =
ALLOCA_N(
char, res_size);
10967 memcpy(tmp_buf, res, res_size);
11004 char *ptr, *p, *pend;
11007 unsigned long sum0 = 0;
11012 ptr = p = RSTRING_PTR(str);
11013 len = RSTRING_LEN(str);
11019 str_mod_check(str, ptr,
len);
11022 sum0 += (
unsigned char)*p;
11033 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
11034 sum0 &= (((
unsigned long)1)<<bits)-1;
11054rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
11058 long width,
len, flen = 1, fclen = 1;
11061 const char *f =
" ";
11062 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11064 int singlebyte = 1, cr;
11068 enc = STR_ENC_GET(str);
11069 termlen = rb_enc_mbminlen(enc);
11073 enc = rb_enc_check(str, pad);
11074 f = RSTRING_PTR(pad);
11075 flen = RSTRING_LEN(pad);
11076 fclen = str_strlen(pad, enc);
11077 singlebyte = single_byte_optimizable(pad);
11078 if (flen == 0 || fclen == 0) {
11079 rb_raise(rb_eArgError,
"zero width padding");
11082 len = str_strlen(str, enc);
11083 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
11085 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
11089 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11090 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11092 size = RSTRING_LEN(str);
11093 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11094 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11095 (
len += llen2 + rlen2) >= LONG_MAX - size) {
11096 rb_raise(rb_eArgError,
"argument too big");
11100 p = RSTRING_PTR(res);
11102 memset(p, *f, llen);
11106 while (llen >= fclen) {
11112 memcpy(p, f, llen2);
11116 memcpy(p, RSTRING_PTR(str), size);
11119 memset(p, *f, rlen);
11123 while (rlen >= fclen) {
11129 memcpy(p, f, rlen2);
11133 TERM_FILL(p, termlen);
11134 STR_SET_LEN(res, p-RSTRING_PTR(res));
11155rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
11157 return rb_str_justify(argc, argv, str,
'l');
11169rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
11171 return rb_str_justify(argc, argv, str,
'r');
11184rb_str_center(
int argc,
VALUE *argv,
VALUE str)
11186 return rb_str_justify(argc, argv, str,
'c');
11202 sep = get_pat_quoted(sep, 0);
11214 pos = rb_str_index(str, sep, 0);
11215 if (pos < 0)
goto failed;
11220 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11223 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11237 long pos = RSTRING_LEN(str);
11239 sep = get_pat_quoted(sep, 0);
11252 pos = rb_str_rindex(str, sep, pos);
11261 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11263 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
11275rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11279 for (i=0; i<argc; i++) {
11280 VALUE tmp = argv[i];
11282 if (rb_reg_start_with_p(tmp, str))
11286 const char *p, *s, *e;
11291 enc = rb_enc_check(str, tmp);
11292 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11293 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11294 p = RSTRING_PTR(str);
11297 if (!at_char_right_boundary(p, s, e, enc))
11299 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11315rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11319 for (i=0; i<argc; i++) {
11320 VALUE tmp = argv[i];
11321 const char *p, *s, *e;
11326 enc = rb_enc_check(str, tmp);
11327 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11328 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11329 p = RSTRING_PTR(str);
11332 if (!at_char_boundary(p, s, e, enc))
11334 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11350deleted_prefix_length(
VALUE str,
VALUE prefix)
11352 const char *strptr, *prefixptr;
11353 long olen, prefixlen;
11354 rb_encoding *enc = rb_enc_get(str);
11358 if (!is_broken_string(prefix) ||
11359 !rb_enc_asciicompat(enc) ||
11360 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11361 enc = rb_enc_check(str, prefix);
11365 prefixlen = RSTRING_LEN(prefix);
11366 if (prefixlen <= 0)
return 0;
11367 olen = RSTRING_LEN(str);
11368 if (olen < prefixlen)
return 0;
11369 strptr = RSTRING_PTR(str);
11370 prefixptr = RSTRING_PTR(prefix);
11371 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11372 if (is_broken_string(prefix)) {
11373 if (!is_broken_string(str)) {
11377 const char *strend = strptr + olen;
11378 const char *after_prefix = strptr + prefixlen;
11379 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11400rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11403 str_modify_keep_cr(str);
11405 prefixlen = deleted_prefix_length(str, prefix);
11406 if (prefixlen <= 0)
return Qnil;
11420rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11424 prefixlen = deleted_prefix_length(str, prefix);
11425 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11427 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11440deleted_suffix_length(
VALUE str,
VALUE suffix)
11442 const char *strptr, *suffixptr;
11443 long olen, suffixlen;
11447 if (is_broken_string(suffix))
return 0;
11448 enc = rb_enc_check(str, suffix);
11451 suffixlen = RSTRING_LEN(suffix);
11452 if (suffixlen <= 0)
return 0;
11453 olen = RSTRING_LEN(str);
11454 if (olen < suffixlen)
return 0;
11455 strptr = RSTRING_PTR(str);
11456 suffixptr = RSTRING_PTR(suffix);
11457 const char *strend = strptr + olen;
11458 const char *before_suffix = strend - suffixlen;
11459 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11460 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
11476rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11478 long olen, suffixlen,
len;
11479 str_modifiable(str);
11481 suffixlen = deleted_suffix_length(str, suffix);
11482 if (suffixlen <= 0)
return Qnil;
11484 olen = RSTRING_LEN(str);
11485 str_modify_keep_cr(str);
11486 len = olen - suffixlen;
11487 STR_SET_LEN(str,
len);
11488 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
11504rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11508 suffixlen = deleted_suffix_length(str, suffix);
11509 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11511 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11518 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
11524nil_setter_warning(
ID id)
11526 rb_warn_deprecated(
"non-nil '%"PRIsVALUE
"'", NULL, rb_id2str(
id));
11533 if (!
NIL_P(*var)) {
11534 nil_setter_warning(
id);
11541 val = rb_fs_check(val);
11544 "value of %"PRIsVALUE
" must be String or Regexp",
11548 nil_setter_warning(
id);
11565 str_modifiable(str);
11567 rb_encoding *encoding = rb_to_encoding(enc);
11568 int idx = rb_enc_to_index(encoding);
11575 rb_enc_associate_index(str, idx);
11599 if (STR_EMBED_P(str)) {
11600 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11605 str_replace_shared_without_enc(str2, str);
11607 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11637rb_str_valid_encoding_p(
VALUE str)
11657rb_str_is_ascii_only_p(
VALUE str)
11667 static const char ellipsis[] =
"...";
11668 const long ellipsislen =
sizeof(ellipsis) - 1;
11669 rb_encoding *
const enc = rb_enc_get(str);
11670 const long blen = RSTRING_LEN(str);
11671 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11672 VALUE estr, ret = 0;
11675 if (
len * rb_enc_mbminlen(enc) >= blen ||
11679 else if (
len <= ellipsislen ||
11681 if (rb_enc_asciicompat(enc)) {
11683 rb_enc_associate(ret, enc);
11690 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11695 rb_enc_from_encoding(enc), 0,
Qnil);
11702str_compat_and_valid(
VALUE str, rb_encoding *enc)
11708 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11711 rb_encoding *e = STR_ENC_GET(str);
11714 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11720static VALUE enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl,
int cr);
11725 rb_encoding *enc = STR_ENC_GET(str);
11730rb_enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl)
11733 if (enc == STR_ENC_GET(str)) {
11738 return enc_str_scrub(enc, str, repl, cr);
11742enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl,
int cr)
11746 const char *rep, *p, *e, *p1, *sp;
11752 rb_raise(rb_eArgError,
"both of block and replacement given");
11759 if (!
NIL_P(repl)) {
11760 repl = str_compat_and_valid(repl, enc);
11763 if (rb_enc_dummy_p(enc)) {
11766 encidx = rb_enc_to_index(enc);
11768#define DEFAULT_REPLACE_CHAR(str) do { \
11769 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11770 rep = replace; replen = (int)sizeof(replace); \
11773 slen = RSTRING_LEN(str);
11774 p = RSTRING_PTR(str);
11779 if (rb_enc_asciicompat(enc)) {
11785 else if (!
NIL_P(repl)) {
11786 rep = RSTRING_PTR(repl);
11787 replen = RSTRING_LEN(repl);
11791 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11795 DEFAULT_REPLACE_CHAR(
"?");
11800 p = search_nonascii(p, e);
11805 int ret = rb_enc_precise_mbclen(p, e, enc);
11824 if (e - p < clen) clen = e - p;
11831 for (; clen > 1; clen--) {
11832 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11843 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11844 str_mod_check(str, sp, slen);
11845 repl = str_compat_and_valid(repl, enc);
11852 p = search_nonascii(p, e);
11878 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11879 str_mod_check(str, sp, slen);
11880 repl = str_compat_and_valid(repl, enc);
11889 long mbminlen = rb_enc_mbminlen(enc);
11893 else if (!
NIL_P(repl)) {
11894 rep = RSTRING_PTR(repl);
11895 replen = RSTRING_LEN(repl);
11897 else if (encidx == ENCINDEX_UTF_16BE) {
11898 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11900 else if (encidx == ENCINDEX_UTF_16LE) {
11901 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11903 else if (encidx == ENCINDEX_UTF_32BE) {
11904 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11906 else if (encidx == ENCINDEX_UTF_32LE) {
11907 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11910 DEFAULT_REPLACE_CHAR(
"?");
11914 int ret = rb_enc_precise_mbclen(p, e, enc);
11927 if (e - p < clen) clen = e - p;
11928 if (clen <= mbminlen * 2) {
11933 for (; clen > mbminlen; clen-=mbminlen) {
11934 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11944 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11945 str_mod_check(str, sp, slen);
11946 repl = str_compat_and_valid(repl, enc);
11971 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11972 str_mod_check(str, sp, slen);
11973 repl = str_compat_and_valid(repl, enc);
12013str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
12021static ID id_normalize;
12022static ID id_normalized_p;
12023static VALUE mUnicodeNormalize;
12026unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
12028 static int UnicodeNormalizeRequired = 0;
12031 if (!UnicodeNormalizeRequired) {
12032 rb_require(
"unicode_normalize/normalize.rb");
12033 UnicodeNormalizeRequired = 1;
12037 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
12048rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
12050 return unicode_normalize_common(argc, argv, str, id_normalize);
12064rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
12066 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12093rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
12095 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12227#define sym_equal rb_obj_equal
12230sym_printable(
const char *s,
const char *send, rb_encoding *enc)
12234 int c = rb_enc_precise_mbclen(s, send, enc);
12238 c = rb_enc_mbc_to_codepoint(s, send, enc);
12246rb_str_symname_p(
VALUE sym)
12254 enc = STR_ENC_GET(sym);
12255 ptr = RSTRING_PTR(sym);
12256 len = RSTRING_LEN(sym);
12257 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12265rb_str_quote_unprintable(
VALUE str)
12270 rb_encoding *resenc;
12275 enc = STR_ENC_GET(str);
12276 ptr = RSTRING_PTR(str);
12277 len = RSTRING_LEN(str);
12278 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12279 !sym_printable(ptr, ptr +
len, enc)) {
12280 return rb_str_escape(str);
12286rb_id_quote_unprintable(
ID id)
12288 VALUE str = rb_id2str(
id);
12289 if (!rb_str_symname_p(str)) {
12290 return rb_str_escape(str);
12308sym_inspect(
VALUE sym)
12315 if (!rb_str_symname_p(str)) {
12317 len = RSTRING_LEN(str);
12318 rb_str_resize(str,
len + 1);
12319 dest = RSTRING_PTR(str);
12320 memmove(dest + 1, dest,
len);
12323 rb_encoding *enc = STR_ENC_GET(str);
12324 VALUE orig_str = str;
12326 len = RSTRING_LEN(orig_str);
12327 str = rb_enc_str_new(0,
len + 1, enc);
12330 ptr = RSTRING_PTR(orig_str);
12331 dest = RSTRING_PTR(str);
12332 memcpy(dest + 1, ptr,
len);
12352rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12357 rb_raise(rb_eArgError,
"no receiver given");
12460 return rb_str_match(
rb_sym2str(sym), other);
12475sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12477 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12490sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12492 return rb_str_match_m_p(argc, argv, sym);
12510 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12521sym_length(
VALUE sym)
12535sym_empty(
VALUE sym)
12569sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12585sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12601sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12615sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12617 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12630sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12632 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12644sym_encoding(
VALUE sym)
12650string_for_symbol(
VALUE name)
12655 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12669 name = string_for_symbol(name);
12670 return rb_intern_str(name);
12679 name = string_for_symbol(name);
12703 return rb_fstring(str);
12709 struct RString fake_str = {RBASIC_INIT};
12710 int encidx = ENCINDEX_US_ASCII;
12712 if (
len > 0 && search_nonascii(ptr, ptr +
len)) {
12713 encidx = ENCINDEX_ASCII_8BIT;
12716 VALUE str = setup_fake_str(&fake_str, ptr,
len, encidx);
12718 return register_fstring(str,
true,
false);
12730 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12731 rb_enc_autoload(enc);
12734 struct RString fake_str = {RBASIC_INIT};
12735 return register_fstring(rb_setup_fake_str(&fake_str, ptr,
len, enc),
true,
false);
12739rb_enc_literal_str(
const char *ptr,
long len, rb_encoding *enc)
12741 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12742 rb_enc_autoload(enc);
12745 struct RString fake_str = {RBASIC_INIT};
12746 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr,
len, enc),
true,
true);
12757#if USE_YJIT || USE_ZJIT
12759rb_jit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12764 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12765 rb_str_buf_cat_byte(str, (
char) code);
12775fstring_set_class_i(
VALUE *str,
void *data)
12779 return ST_CONTINUE;
12787 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12954 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
#define ISSPACE
@old{rb_isspace}
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define ISDIGIT
@old{rb_isdigit}
#define ISALPHA
@old{rb_isalpha}
#define TOLOWER
@old{rb_tolower}
#define ISPRINT
@old{rb_isprint}
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define ALLOCV
Old name of RB_ALLOCV.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT, a.k.a. binary.
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itsel...
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding ...
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding it...
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_hash_new(void)
Creates a new, empty hash object.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every symbol that has ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "defaultexternal" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "defaultexternal" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously classified as shareable or not.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
int ruby_thread_has_gvl_p(void)
Whether the current thread is holding the GVL.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
struct rb_data_type_struct rb_data_type_t
This is the struct that holds necessary info for a struct.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
long len
Length of the string, not including terminating NUL character.
union RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024::@365170260060164113275356137374160141226332013204 aux
Auxiliary info.
struct RString::@157025041137035241047331270155043025061071337053::@153056146250355212360325351117351053336274231135 embed
Embedded contents.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
union RString::@157025041137035241047331270155043025061071337053 as
String's specific fields.
struct RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024 heap
Strings that use separated memory region for contents use this pattern.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.