1 | /*************************************************
|
---|
2 | * libucp - Unicode Property Table handler *
|
---|
3 | *************************************************/
|
---|
4 |
|
---|
5 | /* Internal header file defining the layout of compact nodes in the tree. */
|
---|
6 |
|
---|
7 | typedef struct cnode {
|
---|
8 | unsigned short int f0;
|
---|
9 | unsigned short int f1;
|
---|
10 | unsigned short int f2;
|
---|
11 | } cnode;
|
---|
12 |
|
---|
13 | /* Things for the f0 field */
|
---|
14 |
|
---|
15 | #define f0_leftexists 0x8000 /* Left child exists */
|
---|
16 | #define f0_typemask 0x3f00 /* Type bits */
|
---|
17 | #define f0_typeshift 8 /* Type shift */
|
---|
18 | #define f0_chhmask 0x00ff /* Character high bits */
|
---|
19 |
|
---|
20 | /* Things for the f2 field */
|
---|
21 |
|
---|
22 | #define f2_rightmask 0xf000 /* Mask for right offset bits */
|
---|
23 | #define f2_rightshift 12 /* Shift for right offset */
|
---|
24 | #define f2_casemask 0x0fff /* Mask for case offset */
|
---|
25 |
|
---|
26 | /* The tree consists of a vector of structures of type cnode, with the root
|
---|
27 | node as the first element. The three short ints (16-bits) are used as follows:
|
---|
28 |
|
---|
29 | (f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node
|
---|
30 | is the next node in the vector.
|
---|
31 | (2) The 0x4000 bits of f0 is spare.
|
---|
32 | (3) The 0x3f00 bits of f0 contain the character type; this is a number
|
---|
33 | defined by the enumeration in ucp.h (e.g. ucp_Lu).
|
---|
34 | (4) The bottom 8 bits of f0 contain the most significant byte of the
|
---|
35 | character's 24-bit codepoint.
|
---|
36 |
|
---|
37 | (f1) (1) The f1 field contains the two least significant bytes of the
|
---|
38 | codepoint.
|
---|
39 |
|
---|
40 | (f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this
|
---|
41 | node. Otherwise, they contain one plus the exponent of the power of
|
---|
42 | two of the offset to the right node (e.g. a value of 3 means 8). The
|
---|
43 | units of the offset are node items.
|
---|
44 |
|
---|
45 | (2) The 0x0fff bits of f2 contain the signed offset from this character to
|
---|
46 | its alternate cased value. They are zero if there is no such
|
---|
47 | character.
|
---|
48 |
|
---|
49 |
|
---|
50 | -----------------------------------------------------------------------------
|
---|
51 | ||.|.| type (6) | ms char (8) || ls char (16) ||....| case offset (12) ||
|
---|
52 | -----------------------------------------------------------------------------
|
---|
53 | | | |
|
---|
54 | | |-> spare |
|
---|
55 | | exponent of right
|
---|
56 | |-> left child exists child offset
|
---|
57 |
|
---|
58 |
|
---|
59 | The upper/lower casing information is set only for characters that come in
|
---|
60 | pairs. There are (at present) four non-one-to-one mappings in the Unicode data.
|
---|
61 | These are ignored. They are:
|
---|
62 |
|
---|
63 | 1FBE Greek Prosgegrammeni (lower, with upper -> capital iota)
|
---|
64 | 2126 Ohm
|
---|
65 | 212A Kelvin
|
---|
66 | 212B Angstrom
|
---|
67 |
|
---|
68 | Certainly for the last three, having an alternate case would seem to be a
|
---|
69 | mistake. I don't know any Greek, so cannot comment on the first one.
|
---|
70 |
|
---|
71 |
|
---|
72 | When searching the tree, proceed as follows:
|
---|
73 |
|
---|
74 | (1) Start at the first node.
|
---|
75 |
|
---|
76 | (2) Extract the character value from f1 and the bottom 8 bits of f0;
|
---|
77 |
|
---|
78 | (3) Compare with the character being sought. If equal, we are done.
|
---|
79 |
|
---|
80 | (4) If the test character is smaller, inspect the f0_leftexists flag. If it is
|
---|
81 | not set, the character is not in the tree. If it is set, move to the next
|
---|
82 | node, and go to (2).
|
---|
83 |
|
---|
84 | (5) If the test character is bigger, extract the f2_rightmask bits from f2, and
|
---|
85 | shift them right by f2_rightshift. If the result is zero, the character is
|
---|
86 | not in the tree. Otherwise, calculate the number of nodes to skip by
|
---|
87 | shifting the value 1 left by this number minus one. Go to (2).
|
---|
88 | */
|
---|
89 |
|
---|
90 |
|
---|
91 | /* End of internal.h */
|
---|