summaryrefslogtreecommitdiff
path: root/libuxre/_collelem.c
blob: c5dbb053aabe9e0c524d0d8012b16df9f5bdbe3e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/*
 * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002.
 *
 * Sccsid @(#)_collelem.c	1.4 (gritter) 10/18/03
 */
/*  UNIX(R) Regular Expresssion Library
 *
 *  Note: Code is released under the GNU LGPL
 *
 *  Copyright (C) 2001 Caldera International, Inc.
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to:
 *        Free Software Foundation, Inc.
 *        59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*	#include "synonyms.h"	*/
#include "colldata.h"
#include <stddef.h>

#define CCE(p)	((const CollElem *)(p))
#define CCM(p)	((const CollMult *)(p))

LIBUXRE_STATIC const CollElem *
libuxre_collelem(struct lc_collate *col, CollElem *spare, wchar_t wc)
{
	const char *tbl;
	size_t hi, lo, cur;
	const CollMult *cmp;
	const CollElem *cep;
	long diff;
	int sz;

	/*
	* ELEM_ENCODED is returned when the collation is entirely
	* based on the encoded value of the character.
	*/
	if (col == 0 || col->flags & CHF_ENCODED
		|| (tbl = (const char *)col->maintbl) == 0)
	{
		return ELEM_ENCODED;
	}
	if ((wuchar_type)wc <= UCHAR_MAX)
	{
	indexed:;
		cep = CCE(&tbl[(wuchar_type)wc * col->elemsize]);
		if (cep->weight[0] == WGHT_SPECIAL)
			return ELEM_BADCHAR;
		return cep;
	}
	if (col->flags & CHF_INDEXED)
	{
		if ((wuchar_type)wc >= col->nmain)
			return ELEM_BADCHAR;
		goto indexed;
	}
	/*
	* Binary search for a match.  Could speed up the search if
	* some interpolation was used, but keep it simple for now.
	* Note that this is actually a table of CollMult's.
	*
	* To save space in the file, sequences of similar elements
	* are sometimes compressed into a single CollMult that
	* describes many entries.  This is denoted by a subnbeg
	* with the SUBN_SPECIAL bit set.  The rest of the bits give
	* the range covered by this entry.
	*/
	sz = col->elemsize + (sizeof(CollMult) - sizeof(CollElem));
	tbl += (1 + UCHAR_MAX) * col->elemsize;
	lo = 0;
	hi = col->nmain - UCHAR_MAX;
	while (lo < hi)
	{
		if ((cur = (hi + lo) >> 1) < lo)   /* hi+lo overflowed */
			cur |= ~(~(size_t)0 >> 1); /* lost high order bit */
		cmp = CCM(&tbl[cur * sz]);
		if ((diff = wc - cmp->ch) < 0)
			hi = cur;
		else if (cmp->elem.subnbeg & SUBN_SPECIAL)
		{
			if (diff > (long)(cmp->elem.subnbeg & ~SUBN_SPECIAL))
				lo = cur + 1;
			else /* create an entry from the sequence in spare */
			{
				spare->multbeg = cmp->elem.multbeg;
				spare->subnbeg = 0;
				spare->weight[0] = cmp->elem.weight[0] + diff;
				for (lo = 1; lo < col->nweight; lo++)
				{
					wuchar_type w;

					if ((w = cmp->elem.weight[lo])
						== WGHT_SPECIAL)
					{
						w = spare->weight[0];
					}
					spare->weight[lo] = w;
				}
				return spare;
			}
		}
		else if (diff == 0)
			return &cmp->elem;
		else
			lo = cur + 1;
	}
	return ELEM_BADCHAR;
}