Changeset 27419 in webkit for trunk/JavaScriptCore/pcre/pcre_internal.h
- Timestamp:
- Nov 3, 2007, 10:22:44 PM (18 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/pcre/pcre_internal.h
r27405 r27419 104 104 #include <string.h> 105 105 106 #ifndef PCRE_SPY107 #define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */108 #endif109 110 /* We need to have types that specify unsigned 16-bit and 32-bit integers. We111 cannot determine these outside the compilation (e.g. by running a program as112 part of "configure") because PCRE is often cross-compiled for use on other113 systems. Instead we make use of the maximum sizes that are available at114 preprocessor time in standard C environments. */115 116 #if USHRT_MAX == 65535117 typedef unsigned short pcre_uint16;118 #elif UINT_MAX == 65535119 typedef unsigned int pcre_uint16;120 #else121 #error Cannot determine a type for 16-bit unsigned integers122 #endif123 124 #if UINT_MAX == 4294967295125 typedef unsigned int pcre_uint32;126 #elif ULONG_MAX == 4294967295127 typedef unsigned long int pcre_uint32;128 #else129 #error Cannot determine a type for 32-bit unsigned integers130 #endif131 132 /* All character handling must be done as unsigned characters. Otherwise there133 are problems with top-bit-set characters and functions such as isspace().134 However, we leave the interface to the outside world as char *, because that135 should make things easier for callers. We define a short type for unsigned char136 to save lots of typing. I tried "uchar", but it causes problems on Digital137 Unix, where it is defined in sys/types, so use "uschar" instead. */138 139 typedef unsigned char uschar;140 141 /* When PCRE is compiled as a C++ library, the subject pointer can be replaced142 with a custom type. This makes it possible, for example, to allow pcre_exec()143 to process subject strings that are discontinuous by using a smart pointer144 class. It must always be possible to inspect all of the subject string in145 pcre_exec() because of the way it backtracks. Two macros are required in the146 normal case, for sign-unspecified and unsigned char pointers. The former is147 used for the external interface and appears in pcre.h, which is why its name148 must begin with PCRE_. */149 150 #ifdef CUSTOM_SUBJECT_PTR151 #define PCRE_SPTR CUSTOM_SUBJECT_PTR152 #define USPTR CUSTOM_SUBJECT_PTR153 #else154 #define USPTR const pcre_uchar *155 #endif156 157 106 /* Include the public PCRE header and the definitions of UCP character property 158 107 values. */ 159 108 160 109 #include "pcre.h" 161 #include "ucp.h" 162 163 /* Unsigned version of pcre_char. */ 164 #if PCRE_UTF16 165 typedef pcre_char pcre_uchar; 166 #else 167 typedef unsigned char pcre_uchar; 168 #endif 169 170 /* When compiling for use with the Virtual Pascal compiler, these functions 171 need to have their names changed. PCRE must be compiled with the -DVPCOMPAT 172 option on the command line. */ 173 174 #ifdef VPCOMPAT 175 #define strncmp(s1,s2,m) _strncmp(s1,s2,m) 176 #define memcpy(d,s,n) _memcpy(d,s,n) 177 #define memmove(d,s,n) _memmove(d,s,n) 178 #define memset(s,c,n) _memset(s,c,n) 179 #else /* VPCOMPAT */ 180 181 /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(), 182 define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY 183 is set. Otherwise, include an emulating function for those systems that have 184 neither (there some non-Unix environments where this is the case). This assumes 185 that all calls to memmove are moving strings upwards in store, which is the 186 case in PCRE. */ 187 188 #if ! HAVE_MEMMOVE 189 #undef memmove /* some systems may have a macro */ 190 #if HAVE_BCOPY 191 #define memmove(a, b, c) bcopy(b, a, c) 192 #else /* HAVE_BCOPY */ 193 void * 194 pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n) 195 { 196 size_t i; 197 dest += n; 198 src += n; 199 for (i = 0; i < n; ++i) *(--dest) = *(--src); 200 return dest; 201 } 202 #define memmove(a, b, c) pcre_memmove(a, b, c) 203 #endif /* not HAVE_BCOPY */ 204 #endif /* not HAVE_MEMMOVE */ 205 #endif /* not VPCOMPAT */ 206 110 111 typedef unsigned short pcre_uint16; 112 typedef unsigned pcre_uint32; 113 typedef unsigned char uschar; 114 115 typedef JSRegExp pcre; 116 117 typedef JSRegExpChar pcre_char; 118 typedef JSRegExpChar pcre_uchar; 119 typedef const JSRegExpChar* USPTR; 120 121 /* Temporary fastMalloc/fastFree until we port to C++. */ 122 #ifdef __cplusplus 123 extern "C" { 124 #endif 125 extern void* (*pcre_malloc)(size_t); 126 extern void (*pcre_free)(void*); 127 #ifdef __cplusplus 128 } /* extern "C" */ 129 #endif 207 130 208 131 /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored … … 285 208 byte. The macros for character handling generate simple sequences when used in 286 209 byte-mode, and more complicated ones for UTF-8 characters. */ 287 288 #ifndef SUPPORT_UTF8289 #define GETCHAR(c, eptr) c = *eptr;290 #define GETCHARTEST(c, eptr) c = *eptr;291 #define GETCHARINC(c, eptr) c = *eptr++;292 #define GETCHARINCTEST(c, eptr) c = *eptr++;293 #define GETCHARLEN(c, eptr, len) c = *eptr;294 #define BACKCHAR(eptr)295 296 #else /* SUPPORT_UTF8 */297 210 298 211 /* Get the next UTF-8 character, not advancing the pointer, incrementing length … … 332 245 } 333 246 334 #if PCRE_UTF16335 336 247 #define LEAD_OFFSET (0xd800 - (0x10000 >> 10)) 337 248 #define SURROGATE_OFFSET (0x10000 - (0xd800 << 10) - 0xdc00) … … 376 287 #define ISMIDCHAR(c) IS_TRAILING_SURROGATE(c) 377 288 378 #else379 380 /* Get the next UTF-8 character, not advancing the pointer. This is called when381 we know we are in UTF-8 mode. */382 383 #define GETCHAR(c, eptr) \384 c = *eptr; \385 if ((c & 0xc0) == 0xc0) \386 { \387 int gcii; \388 int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \389 int gcss = 6*gcaa; \390 c = (c & _pcre_utf8_table3[gcaa]) << gcss; \391 for (gcii = 1; gcii <= gcaa; gcii++) \392 { \393 gcss -= 6; \394 c |= (eptr[gcii] & 0x3f) << gcss; \395 } \396 }397 398 /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the399 pointer. */400 401 #define GETCHARTEST(c, eptr) \402 c = *eptr; \403 if (utf8 && (c & 0xc0) == 0xc0) \404 { \405 int gcii; \406 int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \407 int gcss = 6*gcaa; \408 c = (c & _pcre_utf8_table3[gcaa]) << gcss; \409 for (gcii = 1; gcii <= gcaa; gcii++) \410 { \411 gcss -= 6; \412 c |= (eptr[gcii] & 0x3f) << gcss; \413 } \414 }415 416 /* Get the next UTF-8 character, advancing the pointer. This is called when we417 know we are in UTF-8 mode. */418 419 #define GETCHARINC(c, eptr) GETUTF8CHARINC(c, eptr)420 421 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */422 423 #define GETCHARINCTEST(c, eptr) \424 c = *eptr++; \425 if (utf8 && (c & 0xc0) == 0xc0) \426 { \427 int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \428 int gcss = 6*gcaa; \429 c = (c & _pcre_utf8_table3[gcaa]) << gcss; \430 while (gcaa-- > 0) \431 { \432 gcss -= 6; \433 c |= (*eptr++ & 0x3f) << gcss; \434 } \435 }436 437 #define GETCHARLEN(c, eptr) GETUTF8CHARLEN(c, eptr)438 439 /* Return 1 if not the start of a character. */440 441 #define ISMIDCHAR(c) (((c) & 0xc0) == 0x80)442 443 #endif444 445 289 /* If the pointer is not at the start of a character, move it back until 446 290 it is. Called only in UTF-8 mode. */ … … 448 292 #define BACKCHAR(eptr) while(ISMIDCHAR(*eptr)) eptr--; 449 293 450 #endif451 452 294 453 295 /* In case there is no definition of offsetof() provided - though any proper … … 458 300 #endif 459 301 460 461 /* These are the public options that can change during matching. */462 463 #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)464 302 465 303 /* Private options flags start at the most significant end of the four bytes, … … 471 309 #define PCRE_REQCHSET 0x20000000 /* req_byte is set */ 472 310 #define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */ 473 #define PCRE_ICHANGED 0x08000000 /* i option changes within regex */ 474 #define PCRE_NOPARTIAL 0x04000000 /* can't use partial with this regex */ 475 476 /* Options for the "extra" block produced by pcre_study(). */ 477 478 #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */ 479 480 /* Masks for identifying the public options that are permitted at compile 481 time, run time, or study time, respectively. */ 482 483 #define PUBLIC_OPTIONS \ 484 (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ 485 PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ 486 PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE) 487 488 #define PUBLIC_EXEC_OPTIONS \ 489 (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ 490 PCRE_PARTIAL) 491 492 #define PUBLIC_DFA_EXEC_OPTIONS \ 493 (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ 494 PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART) 495 496 #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ 497 498 /* Magic number to provide a small check against being handed junk. Also used 499 to detect whether a pattern was compiled on a host of different endianness. */ 500 501 #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ 311 #define PCRE_ANCHORED 0x02000000 /* can't use partial with this regex */ 312 #define PCRE_CASELESS JS_REGEXP_CASELESS 313 #define PCRE_MULTILINE JS_REGEXP_MULTILINE 502 314 503 315 /* Negative values for the firstchar and reqchar variables */ … … 537 349 538 350 #ifndef ESC_n 539 #define ESC_n NEWLINE351 #define ESC_n '\n' 540 352 #endif 541 353 … … 601 413 /* Values corresponding to backslashed metacharacters */ 602 414 603 OP_SOD, /* 1 Start of data: \A */604 OP_SOM, /* 2 Start of match (subject + offset): \G */415 xOP_SOD, /* 1 Start of data: \A */ 416 xOP_SOM, /* 2 Start of match (subject + offset): \G */ 605 417 OP_NOT_WORD_BOUNDARY, /* 3 \B */ 606 418 OP_WORD_BOUNDARY, /* 4 \b */ … … 612 424 OP_WORDCHAR, /* 10 \w */ 613 425 OP_ANY, /* 11 Match any character */ 614 OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */615 OP_NOTPROP, /* 13 \P (not Unicode property) */616 OP_PROP, /* 14 \p (Unicode property) */617 OP_EXTUNI, /* 15 \X (extended Unicode sequence */618 OP_EODN, /* 16 End of data or \n at end of data: \Z. */619 OP_EOD, /* 17 End of data: \z */620 621 OP_OPT, /* 18 Set runtime options */426 xOP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */ 427 xOP_NOTPROP, /* 13 \P (not Unicode property) */ 428 xOP_PROP, /* 14 \p (Unicode property) */ 429 xOP_EXTUNI, /* 15 \X (extended Unicode sequence */ 430 xOP_EODN, /* 16 End of data or \n at end of data: \Z. */ 431 xOP_EOD, /* 17 End of data: \z */ 432 433 xOP_OPT, /* 18 Set runtime options */ 622 434 OP_CIRC, /* 19 Start of line - varies with multiline switch */ 623 435 OP_DOLL, /* 20 End of line - varies with multiline switch */ … … 674 486 675 487 OP_REF, /* 62 Match a back reference */ 676 OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */677 OP_CALLOUT, /* 64 Call out to external function if provided */488 xOP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */ 489 xOP_CALLOUT, /* 64 Call out to external function if provided */ 678 490 679 491 OP_ALT, /* 65 Start of alternation */ … … 686 498 OP_ASSERT, /* 69 Positive lookahead */ 687 499 OP_ASSERT_NOT, /* 70 Negative lookahead */ 688 OP_ASSERTBACK, /* 71 Positive lookbehind */689 OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */690 OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */500 xOP_ASSERTBACK, /* 71 Positive lookbehind */ 501 xOP_ASSERTBACK_NOT, /* 72 Negative lookbehind */ 502 xOP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */ 691 503 692 504 /* ONCE and COND must come after the assertions, with ONCE first, as there's … … 694 506 695 507 OP_ONCE, /* 74 Once matched, don't back up into the subpattern */ 696 OP_COND, /* 75 Conditional group */697 OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */508 xOP_COND, /* 75 Conditional group */ 509 xOP_CREF, /* 76 Used to hold an extraction string number (cond ref) */ 698 510 699 511 OP_BRAZERO, /* 77 These two must remain together and in this */ … … 824 636 825 637 typedef struct real_pcre { 826 pcre_uint32 magic_number;827 638 pcre_uint32 size; /* Total that was malloced */ 828 639 pcre_uint32 options; 829 pcre_uint32 dummy1; /* For future use, maybe */830 640 831 641 pcre_uint16 top_bracket; … … 867 677 unsigned int backref_map; /* Bitmap of low back refs */ 868 678 int req_varyopt; /* "After variable item" flag for reqbyte */ 869 BOOL nopartial; /* Set TRUE if partial won't work */870 679 } compile_data; 871 680 … … 877 686 uschar *current; 878 687 } branch_chain; 879 880 /* Structure for items in a linked list that represents an explicit recursive881 call within the pattern. */882 883 typedef struct recursion_info {884 struct recursion_info *prevrec; /* Previous recursion record (or NULL) */885 int group_num; /* Number of group that was called */886 const uschar *after_call; /* "Return value": points after the call in the expr */887 USPTR save_start; /* Old value of md->start_match */888 int *offset_save; /* Pointer to start of saved offsets */889 int saved_max; /* Number of saved offsets */890 } recursion_info;891 688 892 689 /* When compiling in a mode that doesn't use recursive calls to match(), … … 905 702 typedef struct match_data { 906 703 unsigned long int match_call_count; /* As it says */ 907 unsigned long int match_limit; /* As it says */908 unsigned long int match_limit_recursion; /* As it says */909 704 int *offset_vector; /* Offset vector */ 910 705 int offset_end; /* One past the end */ … … 913 708 const uschar *ctypes; /* Points to table of type maps */ 914 709 BOOL offset_overflow; /* Set if too many extractions */ 915 BOOL notbol; /* NOTBOL flag */916 BOOL noteol; /* NOTEOL flag */917 BOOL utf8; /* UTF8 flag */918 BOOL endonly; /* Dollar not before final \n */919 BOOL notempty; /* Empty string match not wanted */920 BOOL partial; /* PARTIAL flag */921 BOOL hitend; /* Hit the end of the subject at some point */922 const uschar *start_code; /* For use when recursing */923 710 USPTR start_subject; /* Start of the subject string */ 924 711 USPTR end_subject; /* End of the subject string */ … … 926 713 USPTR end_match_ptr; /* Subject position at end match */ 927 714 int end_offset_top; /* Highwater mark at end of match */ 928 int capture_last; /* Most recent capture number */929 int start_offset; /* The start offset value */930 recursion_info *recursive; /* Linked list of recursion data */931 void *callout_data; /* To pass back to callouts */932 715 struct heapframe *thisframe; /* Used only when compiling for no recursion */ 716 BOOL multiline; 717 BOOL caseless; 933 718 } match_data; 934 719 935 /* A similar structure is used for the same purpose by the DFA matching936 functions. */937 938 typedef struct dfa_match_data {939 const uschar *start_code; /* Start of the compiled pattern */940 const pcre_uchar *start_subject; /* Start of the subject string */941 const pcre_uchar *end_subject; /* End of subject string */942 const uschar *tables; /* Character tables */943 int moptions; /* Match options */944 int poptions; /* Pattern options */945 void *callout_data; /* To pass back to callouts */946 } dfa_match_data;947 948 720 /* Bit definitions for entries in the pcre_ctypes table. */ 949 721 950 722 #define ctype_space 0x01 951 #define ctype_letter 0x02952 723 #define ctype_digit 0x04 953 724 #define ctype_xdigit 0x08 954 725 #define ctype_word 0x10 /* alphameric or '_' */ 955 #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */956 726 957 727 /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set 958 728 of bits for a class map. Some classes are built by combining these tables. */ 959 729 960 #define cbit_space 0 /* [:space:] or \s */ 961 #define cbit_xdigit 32 /* [:xdigit:] */ 962 #define cbit_digit 64 /* [:digit:] or \d */ 963 #define cbit_upper 96 /* [:upper:] */ 964 #define cbit_lower 128 /* [:lower:] */ 965 #define cbit_word 160 /* [:word:] or \w */ 966 #define cbit_graph 192 /* [:graph:] */ 967 #define cbit_print 224 /* [:print:] */ 968 #define cbit_punct 256 /* [:punct:] */ 969 #define cbit_cntrl 288 /* [:cntrl:] */ 970 #define cbit_length 320 /* Length of the cbits table */ 730 #define cbit_space 0 /* \s */ 731 #define cbit_digit 32 /* \d */ 732 #define cbit_word 64 /* \w */ 733 #define cbit_length 96 /* Length of the cbits table */ 971 734 972 735 /* Offsets of the various tables from the base tables pointer, and … … 977 740 #define cbits_offset 512 978 741 #define ctypes_offset (cbits_offset + cbit_length) 979 #define tables_length (ctypes_offset + 256)742 #define tables_length (ctypes_offset + 128) 980 743 981 744 /* Layout of the UCP type table that translates property names into types and … … 1001 764 extern const int _pcre_utf8_table1_size; 1002 765 1003 extern const ucp_type_table _pcre_utt[];1004 extern const int _pcre_utt_size;1005 1006 766 extern const uschar _pcre_default_tables[]; 1007 767 … … 1014 774 1015 775 extern int _pcre_ord2utf8(int, uschar *); 1016 extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *,1017 const pcre_study_data *, pcre_study_data *);1018 extern int _pcre_ucp_findprop(const int, int *, int *);1019 776 extern int _pcre_ucp_othercase(const int); 1020 extern int _pcre_valid_utf8(const uschar *, int);1021 777 extern BOOL _pcre_xclass(int, const uschar *); 1022 778 1023 #if JAVASCRIPT1024 779 #define IS_NEWLINE(nl) ((nl) == 0xA || (nl) == 0xD || (nl) == 0x2028 || (nl) == 0x2029) 1025 #else1026 #define IS_NEWLINE(nl) ((nl) == NEWLINE)1027 #endif1028 780 1029 781 #endif
Note:
See TracChangeset
for help on using the changeset viewer.