From 652388f3fb610f85c3de6b7a478728b7077cf4f5 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 11 Dec 2008 15:31:48 -0800 Subject: [PATCH 1/2] Moving all lib/libspl contexts to linux-libspl branch --- configure.ac | 1 - lib/Makefile.am | 2 +- lib/libspl/Makefile.am | 14 - lib/libspl/include/sys/list.h | 67 - lib/libspl/include/sys/list_impl.h | 53 - lib/libspl/list.c | 245 ---- lib/libspl/mkdirp.c | 212 --- lib/libspl/strlcat.c | 59 - lib/libspl/strlcpy.c | 55 - lib/libspl/strnlen.c | 47 - lib/libspl/u8_textprep.c | 2133 ---------------------------- 11 files changed, 1 insertion(+), 2887 deletions(-) delete mode 100644 lib/libspl/Makefile.am delete mode 100644 lib/libspl/include/sys/list.h delete mode 100644 lib/libspl/include/sys/list_impl.h delete mode 100644 lib/libspl/list.c delete mode 100644 lib/libspl/mkdirp.c delete mode 100644 lib/libspl/strlcat.c delete mode 100644 lib/libspl/strlcpy.c delete mode 100644 lib/libspl/strnlen.c delete mode 100644 lib/libspl/u8_textprep.c diff --git a/configure.ac b/configure.ac index 2400bb47d9..845ef93d5a 100644 --- a/configure.ac +++ b/configure.ac @@ -106,7 +106,6 @@ AC_CONFIG_FILES([ Makefile doc/Makefile scripts/Makefile lib/Makefile - lib/libspl/Makefile lib/libavl/Makefile lib/libnvpair/Makefile lib/libuutil/Makefile diff --git a/lib/Makefile.am b/lib/Makefile.am index df55aacdc5..e5607a4bbc 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -1 +1 @@ -SUBDIRS = libspl libavl libnvpair libuutil libzcommon libzpool libzfs +SUBDIRS = libavl libnvpair libuutil libzcommon libzpool libzfs diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am deleted file mode 100644 index 451995830f..0000000000 --- a/lib/libspl/Makefile.am +++ /dev/null @@ -1,14 +0,0 @@ -include $(top_srcdir)/config/Rules.am - -DEFAULT_INCLUDES = -I${libdir}/libspl/include - -lib_LTLIBRARIES = libspl.la - -libspl_la_SOURCES = list.c \ - mkdirp.c \ - strlcat.c \ - strlcpy.c \ - strnlen.c \ - u8_textprep.c \ - include/sys/list.h \ - 
include/sys/list_impl.h diff --git a/lib/libspl/include/sys/list.h b/lib/libspl/include/sys/list.h deleted file mode 100644 index 8339b6226d..0000000000 --- a/lib/libspl/include/sys/list.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_LIST_H -#define _SYS_LIST_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct list_node list_node_t; -typedef struct list list_t; - -void list_create(list_t *, size_t, size_t); -void list_destroy(list_t *); - -void list_insert_after(list_t *, void *, void *); -void list_insert_before(list_t *, void *, void *); -void list_insert_head(list_t *, void *); -void list_insert_tail(list_t *, void *); -void list_remove(list_t *, void *); -void *list_remove_head(list_t *); -void *list_remove_tail(list_t *); -void list_move_tail(list_t *, list_t *); - -void *list_head(list_t *); -void *list_tail(list_t *); -void *list_next(list_t *, void *); -void *list_prev(list_t *, void *); -int list_is_empty(list_t *); - -void list_link_init(list_node_t *); -void list_link_replace(list_node_t *, list_node_t *); - -int list_link_active(list_node_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_LIST_H */ diff --git a/lib/libspl/include/sys/list_impl.h b/lib/libspl/include/sys/list_impl.h deleted file mode 100644 index 9c42f88320..0000000000 --- a/lib/libspl/include/sys/list_impl.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_LIST_IMPL_H -#define _SYS_LIST_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct list_node { - struct list_node *list_next; - struct list_node *list_prev; -}; - -struct list { - size_t list_size; - size_t list_offset; - struct list_node list_head; -}; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_LIST_IMPL_H */ diff --git a/lib/libspl/list.c b/lib/libspl/list.c deleted file mode 100644 index e8db13a5cf..0000000000 --- a/lib/libspl/list.c +++ /dev/null @@ -1,245 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * Generic doubly-linked list implementation - */ - -#include -#include -#include -#include -#include - -#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) -#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) -#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) - -#define list_insert_after_node(list, node, object) { \ - list_node_t *lnew = list_d2l(list, object); \ - lnew->list_prev = (node); \ - lnew->list_next = (node)->list_next; \ - (node)->list_next->list_prev = lnew; \ - (node)->list_next = lnew; \ -} - -#define list_insert_before_node(list, node, object) { \ - list_node_t *lnew = list_d2l(list, object); \ - lnew->list_next = (node); \ - lnew->list_prev = (node)->list_prev; \ - (node)->list_prev->list_next = lnew; \ - (node)->list_prev = lnew; \ -} - -#define list_remove_node(node) \ - (node)->list_prev->list_next = (node)->list_next; \ - (node)->list_next->list_prev = (node)->list_prev; \ - (node)->list_next = (node)->list_prev = NULL - -void -list_create(list_t *list, size_t size, size_t offset) -{ - ASSERT(list); - ASSERT(size > 0); - ASSERT(size >= offset + sizeof (list_node_t)); - - list->list_size = size; - list->list_offset = offset; - list->list_head.list_next = list->list_head.list_prev = - &list->list_head; -} - -void -list_destroy(list_t *list) -{ - list_node_t *node = &list->list_head; - - ASSERT(list); - ASSERT(list->list_head.list_next == node); - ASSERT(list->list_head.list_prev == node); - - node->list_next = node->list_prev = NULL; -} - -void -list_insert_after(list_t *list, void *object, void *nobject) -{ - if (object == NULL) { - list_insert_head(list, nobject); - } else { - list_node_t *lold = list_d2l(list, object); - list_insert_after_node(list, lold, nobject); - } -} - -void -list_insert_before(list_t *list, void *object, void *nobject) -{ - if (object == NULL) { - list_insert_tail(list, nobject); - } else { - list_node_t 
*lold = list_d2l(list, object); - list_insert_before_node(list, lold, nobject); - } -} - -void -list_insert_head(list_t *list, void *object) -{ - list_node_t *lold = &list->list_head; - list_insert_after_node(list, lold, object); -} - -void -list_insert_tail(list_t *list, void *object) -{ - list_node_t *lold = &list->list_head; - list_insert_before_node(list, lold, object); -} - -void -list_remove(list_t *list, void *object) -{ - list_node_t *lold = list_d2l(list, object); - ASSERT(!list_empty(list)); - ASSERT(lold->list_next != NULL); - list_remove_node(lold); -} - -void * -list_remove_head(list_t *list) -{ - list_node_t *head = list->list_head.list_next; - if (head == &list->list_head) - return (NULL); - list_remove_node(head); - return (list_object(list, head)); -} - -void * -list_remove_tail(list_t *list) -{ - list_node_t *tail = list->list_head.list_prev; - if (tail == &list->list_head) - return (NULL); - list_remove_node(tail); - return (list_object(list, tail)); -} - -void * -list_head(list_t *list) -{ - if (list_empty(list)) - return (NULL); - return (list_object(list, list->list_head.list_next)); -} - -void * -list_tail(list_t *list) -{ - if (list_empty(list)) - return (NULL); - return (list_object(list, list->list_head.list_prev)); -} - -void * -list_next(list_t *list, void *object) -{ - list_node_t *node = list_d2l(list, object); - - if (node->list_next != &list->list_head) - return (list_object(list, node->list_next)); - - return (NULL); -} - -void * -list_prev(list_t *list, void *object) -{ - list_node_t *node = list_d2l(list, object); - - if (node->list_prev != &list->list_head) - return (list_object(list, node->list_prev)); - - return (NULL); -} - -/* - * Insert src list after dst list. Empty src list thereafter. 
- */ -void -list_move_tail(list_t *dst, list_t *src) -{ - list_node_t *dstnode = &dst->list_head; - list_node_t *srcnode = &src->list_head; - - ASSERT(dst->list_size == src->list_size); - ASSERT(dst->list_offset == src->list_offset); - - if (list_empty(src)) - return; - - dstnode->list_prev->list_next = srcnode->list_next; - srcnode->list_next->list_prev = dstnode->list_prev; - dstnode->list_prev = srcnode->list_prev; - srcnode->list_prev->list_next = dstnode; - - /* empty src list */ - srcnode->list_next = srcnode->list_prev = srcnode; -} - -void -list_link_replace(list_node_t *lold, list_node_t *lnew) -{ - ASSERT(list_link_active(lold)); - ASSERT(!list_link_active(lnew)); - - lnew->list_next = lold->list_next; - lnew->list_prev = lold->list_prev; - lold->list_prev->list_next = lnew; - lold->list_next->list_prev = lnew; - lold->list_next = lold->list_prev = NULL; -} - -void -list_link_init(list_node_t *link) -{ - link->list_next = NULL; - link->list_prev = NULL; -} - -int -list_link_active(list_node_t *link) -{ - return (link->list_next != NULL); -} - -int -list_is_empty(list_t *list) -{ - return (list_empty(list)); -} diff --git a/lib/libspl/mkdirp.c b/lib/libspl/mkdirp.c deleted file mode 100644 index 9c81f2a0b8..0000000000 --- a/lib/libspl/mkdirp.c +++ /dev/null @@ -1,212 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * Creates directory and it's parents if the parents do not - * exist yet. - * - * Returns -1 if fails for reasons other than non-existing - * parents. - * Does NOT simplify pathnames with . or .. in them. - */ - -#include -#include -#include -#include -#include -#include -#include - -static char *simplify(const char *str); - -int -mkdirp(const char *d, mode_t mode) -{ - char *endptr, *ptr, *slash, *str; - - str = simplify(d); - - /* If space couldn't be allocated for the simplified names, return. */ - - if (str == NULL) - return (-1); - - /* Try to make the directory */ - - if (mkdir(str, mode) == 0) { - free(str); - return (0); - } - if (errno != ENOENT) { - free(str); - return (-1); - } - endptr = strrchr(str, '\0'); - slash = strrchr(str, '/'); - - /* Search upward for the non-existing parent */ - - while (slash != NULL) { - - ptr = slash; - *ptr = '\0'; - - /* If reached an existing parent, break */ - - if (access(str, F_OK) == 0) - break; - - /* If non-existing parent */ - - else { - slash = strrchr(str, '/'); - - /* If under / or current directory, make it. 
*/ - - if (slash == NULL || slash == str) { - if (mkdir(str, mode) != 0 && errno != EEXIST) { - free(str); - return (-1); - } - break; - } - } - } - - /* Create directories starting from upmost non-existing parent */ - - while ((ptr = strchr(str, '\0')) != endptr) { - *ptr = '/'; - if (mkdir(str, mode) != 0 && errno != EEXIST) { - /* - * If the mkdir fails because str already - * exists (EEXIST), then str has the form - * "existing-dir/..", and this is really - * ok. (Remember, this loop is creating the - * portion of the path that didn't exist) - */ - free(str); - return (-1); - } - } - free(str); - return (0); -} - -/* - * simplify - given a pathname, simplify that path by removing - * duplicate contiguous slashes. - * - * A simplified copy of the argument is returned to the - * caller, or NULL is returned on error. - * - * The caller should handle error reporting based upon the - * returned vlaue, and should free the returned value, - * when appropriate. - */ - -static char * -simplify(const char *str) -{ - int i; - size_t mbPathlen; /* length of multi-byte path */ - size_t wcPathlen; /* length of wide-character path */ - wchar_t *wptr; /* scratch pointer */ - wchar_t *wcPath; /* wide-character version of the path */ - char *mbPath; /* The copy fo the path to be returned */ - - /* - * bail out if there is nothing there. - */ - - if (!str) - return (NULL); - - /* - * Get a copy of the argument. - */ - - if ((mbPath = strdup(str)) == NULL) { - return (NULL); - } - - /* - * convert the multi-byte version of the path to a - * wide-character rendering, for doing our figuring. 
- */ - - mbPathlen = strlen(mbPath); - - if ((wcPath = calloc(sizeof (wchar_t), mbPathlen+1)) == NULL) { - free(mbPath); - return (NULL); - } - - if ((wcPathlen = mbstowcs(wcPath, mbPath, mbPathlen)) == (size_t)-1) { - free(mbPath); - free(wcPath); - return (NULL); - } - - /* - * remove duplicate slashes first ("//../" -> "/") - */ - - for (wptr = wcPath, i = 0; i < wcPathlen; i++) { - *wptr++ = wcPath[i]; - - if (wcPath[i] == '/') { - i++; - - while (wcPath[i] == '/') { - i++; - } - - i--; - } - } - - *wptr = '\0'; - - /* - * now convert back to the multi-byte format. - */ - - if (wcstombs(mbPath, wcPath, mbPathlen) == (size_t)-1) { - free(mbPath); - free(wcPath); - return (NULL); - } - - free(wcPath); - return (mbPath); -} diff --git a/lib/libspl/strlcat.c b/lib/libspl/strlcat.c deleted file mode 100644 index 07d1403dde..0000000000 --- a/lib/libspl/strlcat.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "lint.h" -#include -#include - -/* - * Appends src to the dstsize buffer at dst. 
The append will never - * overflow the destination buffer and the buffer will always be null - * terminated. Never reference beyond &dst[dstsize-1] when computing - * the length of the pre-existing string. - */ - -size_t -strlcat(char *dst, const char *src, size_t dstsize) -{ - char *df = dst; - size_t left = dstsize; - size_t l1; - size_t l2 = strlen(src); - size_t copied; - - while (left-- != 0 && *df != '\0') - df++; - l1 = df - dst; - if (dstsize == l1) - return (l1 + l2); - - copied = l1 + l2 >= dstsize ? dstsize - l1 - 1 : l2; - (void) memcpy(dst + l1, src, copied); - dst[l1+copied] = '\0'; - return (l1 + l2); -} diff --git a/lib/libspl/strlcpy.c b/lib/libspl/strlcpy.c deleted file mode 100644 index 7a8009b893..0000000000 --- a/lib/libspl/strlcpy.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "lint.h" -#include -#include - -/* - * Copies src to the dstsize buffer at dst. The copy will never - * overflow the destination buffer and the buffer will always be null - * terminated. 
- */ - -size_t -strlcpy(char *dst, const char *src, size_t len) -{ - size_t slen = strlen(src); - size_t copied; - - if (len == 0) - return (slen); - - if (slen >= len) - copied = len - 1; - else - copied = slen; - (void) memcpy(dst, src, copied); - dst[copied] = '\0'; - return (slen); -} diff --git a/lib/libspl/strnlen.c b/lib/libspl/strnlen.c deleted file mode 100644 index 605245b6bb..0000000000 --- a/lib/libspl/strnlen.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. - * All rights reserved. Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "lint.h" -#include -#include - -/* - * Returns the number of non-NULL bytes in string argument, - * but not more than maxlen. Does not look past str + maxlen. 
- */ -size_t -strnlen(const char *str, size_t maxlen) -{ - const char *ptr; - - ptr = memchr(str, 0, maxlen); - if (ptr == NULL) - return (maxlen); - - return (ptr - str); -} diff --git a/lib/libspl/u8_textprep.c b/lib/libspl/u8_textprep.c deleted file mode 100644 index 35cafbaa9a..0000000000 --- a/lib/libspl/u8_textprep.c +++ /dev/null @@ -1,2133 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - - -/* - * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458). - * - * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), - * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also - * the section 3C man pages. - * Interface stability: Committed. - */ - -#include -#ifdef _KERNEL -#include -#include -#include -#include -#include -#include -#include -#else -#include -#include -#endif /* _KERNEL */ -#include -#include -#include - - -/* The maximum possible number of bytes in a UTF-8 character. 
*/ -#define U8_MB_CUR_MAX (4) - -/* - * The maximum number of bytes needed for a UTF-8 character to cover - * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2. - */ -#define U8_MAX_BYTES_UCS2 (3) - -/* The maximum possible number of bytes in a Stream-Safe Text. */ -#define U8_STREAM_SAFE_TEXT_MAX (128) - -/* - * The maximum number of characters in a combining/conjoining sequence and - * the actual upperbound limit of a combining/conjoining sequence. - */ -#define U8_MAX_CHARS_A_SEQ (32) -#define U8_UPPER_LIMIT_IN_A_SEQ (31) - -/* The combining class value for Starter. */ -#define U8_COMBINING_CLASS_STARTER (0) - -/* - * Some Hangul related macros at below. - * - * The first and the last of Hangul syllables, Hangul Jamo Leading consonants, - * Vowels, and optional Trailing consonants in Unicode scalar values. - * - * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not - * the actual U+11A8. This is due to that the trailing consonant is optional - * and thus we are doing a pre-calculation of subtracting one. - * - * Each of 19 modern leading consonants has total 588 possible syllables since - * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for - * no trailing consonant case, i.e., 21 x 28 = 588. - * - * We also have bunch of Hangul related macros at below. Please bear in mind - * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is - * a Hangul Jamo or not but the value does not guarantee that it is a Hangul - * Jamo; it just guarantee that it will be most likely. 
- */ -#define U8_HANGUL_SYL_FIRST (0xAC00U) -#define U8_HANGUL_SYL_LAST (0xD7A3U) - -#define U8_HANGUL_JAMO_L_FIRST (0x1100U) -#define U8_HANGUL_JAMO_L_LAST (0x1112U) -#define U8_HANGUL_JAMO_V_FIRST (0x1161U) -#define U8_HANGUL_JAMO_V_LAST (0x1175U) -#define U8_HANGUL_JAMO_T_FIRST (0x11A7U) -#define U8_HANGUL_JAMO_T_LAST (0x11C2U) - -#define U8_HANGUL_V_COUNT (21) -#define U8_HANGUL_VT_COUNT (588) -#define U8_HANGUL_T_COUNT (28) - -#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) - -#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ - (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ - (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ - (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); - -#define U8_HANGUL_JAMO_L(u) \ - ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) - -#define U8_HANGUL_JAMO_V(u) \ - ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) - -#define U8_HANGUL_JAMO_T(u) \ - ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) - -#define U8_HANGUL_JAMO(u) \ - ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) - -#define U8_HANGUL_SYLLABLE(u) \ - ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) - -#define U8_HANGUL_COMPOSABLE_L_V(s, u) \ - ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) - -#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ - ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) - -/* The types of decomposition mappings. */ -#define U8_DECOMP_BOTH (0xF5U) -#define U8_DECOMP_CANONICAL (0xF6U) - -/* The indicator for 16-bit table. */ -#define U8_16BIT_TABLE_INDICATOR (0x8000U) - -/* The following are some convenience macros. */ -#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ - (u) = (((uint32_t)(b1) & 0x0F) << 12) | \ - (((uint32_t)(b2) & 0x3F) << 6) | \ - ((uint32_t)(b3) & 0x3F); - -#define U8_SIMPLE_SWAP(a, b, t) \ - (t) = (a); \ - (a) = (b); \ - (b) = (t); - -#define U8_ASCII_TOUPPER(c) \ - (((c) >= 'a' && (c) <= 'z') ? 
(c) - 'a' + 'A' : (c)) - -#define U8_ASCII_TOLOWER(c) \ - (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) - -#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) -/* - * The following macro assumes that the two characters that are to be - * swapped are adjacent to each other and 'a' comes before 'b'. - * - * If the assumptions are not met, then, the macro will fail. - */ -#define U8_SWAP_COMB_MARKS(a, b) \ - for (k = 0; k < disp[(a)]; k++) \ - u8t[k] = u8s[start[(a)] + k]; \ - for (k = 0; k < disp[(b)]; k++) \ - u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ - start[(b)] = start[(a)] + disp[(b)]; \ - for (k = 0; k < disp[(a)]; k++) \ - u8s[start[(b)] + k] = u8t[k]; \ - U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ - U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); - -/* The possible states during normalization. */ -typedef enum { - U8_STATE_START = 0, - U8_STATE_HANGUL_L = 1, - U8_STATE_HANGUL_LV = 2, - U8_STATE_HANGUL_LVT = 3, - U8_STATE_HANGUL_V = 4, - U8_STATE_HANGUL_T = 5, - U8_STATE_COMBINING_MARK = 6 -} u8_normalization_states_t; - -/* - * The three vectors at below are used to check bytes of a given UTF-8 - * character are valid and not containing any malformed byte values. - * - * We used to have a quite relaxed UTF-8 binary representation but then there - * was some security related issues and so the Unicode Consortium defined - * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it - * one more time at the Unicode 3.2. The following three tables are based on - * that. 
- */ - -#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) - -#define I_ U8_ILLEGAL_CHAR -#define O_ U8_OUT_OF_RANGE_CHAR - -const int8_t u8_number_of_bytes[0x100] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - -/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ - I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - -/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - -/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - -/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ - 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, -}; - -#undef I_ -#undef O_ - -const uint8_t u8_valid_min_2nd_byte[0x100] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 
0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -/* C0 C1 C2 C3 C4 C5 C6 C7 */ - 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* C8 C9 CA CB CC CD CE CF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* D0 D1 D2 D3 D4 D5 D6 D7 */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* D8 D9 DA DB DC DD DE DF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* E0 E1 E2 E3 E4 E5 E6 E7 */ - 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* E8 E9 EA EB EC ED EE EF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* F0 F1 F2 F3 F4 F5 F6 F7 */ - 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -}; - -const uint8_t u8_valid_max_2nd_byte[0x100] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -/* C0 C1 C2 C3 C4 C5 C6 C7 */ - 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* C8 C9 CA CB CC CD CE CF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* D0 D1 D2 D3 D4 D5 D6 D7 */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* D8 D9 DA DB DC DD DE DF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* E0 E1 E2 E3 E4 E5 E6 E7 */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* E8 E9 EA EB EC ED EE EF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, -/* F0 F1 F2 F3 F4 F5 F6 F7 */ - 0xbf, 
0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -}; - - -/* - * The u8_validate() validates on the given UTF-8 character string and - * calculate the byte length. It is quite similar to mblen(3C) except that - * this will validate against the list of characters if required and - * specific to UTF-8 and Unicode. - */ -int -u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum) -{ - uchar_t *ib; - uchar_t *ibtail; - uchar_t **p; - uchar_t *s1; - uchar_t *s2; - uchar_t f; - int sz; - size_t i; - int ret_val; - boolean_t second; - boolean_t no_need_to_validate_entire; - boolean_t check_additional; - boolean_t validate_ucs2_range_only; - - if (! u8str) - return (0); - - ib = (uchar_t *)u8str; - ibtail = ib + n; - - ret_val = 0; - - no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); - check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; - validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; - - while (ib < ibtail) { - /* - * The first byte of a UTF-8 character tells how many - * bytes will follow for the character. If the first byte - * is an illegal byte value or out of range value, we just - * return -1 with an appropriate error number. - */ - sz = u8_number_of_bytes[*ib]; - if (sz == U8_ILLEGAL_CHAR) { - *errnum = EILSEQ; - return (-1); - } - - if (sz == U8_OUT_OF_RANGE_CHAR || - (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { - *errnum = ERANGE; - return (-1); - } - - /* - * If we don't have enough bytes to check on, that's also - * an error. As you can see, we give illegal byte sequence - * checking higher priority then EINVAL cases. - */ - if ((ibtail - ib) < sz) { - *errnum = EINVAL; - return (-1); - } - - if (sz == 1) { - ib++; - ret_val++; - } else { - /* - * Check on the multi-byte UTF-8 character. For more - * details on this, see comment added for the used - * data structures at the beginning of the file. 
- */ - f = *ib++; - ret_val++; - second = B_TRUE; - for (i = 1; i < sz; i++) { - if (second) { - if (*ib < u8_valid_min_2nd_byte[f] || - *ib > u8_valid_max_2nd_byte[f]) { - *errnum = EILSEQ; - return (-1); - } - second = B_FALSE; - } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { - *errnum = EILSEQ; - return (-1); - } - ib++; - ret_val++; - } - } - - if (check_additional) { - for (p = (uchar_t **)list, i = 0; p[i]; i++) { - s1 = ib - sz; - s2 = p[i]; - while (s1 < ib) { - if (*s1 != *s2 || *s2 == '\0') - break; - s1++; - s2++; - } - - if (s1 >= ib && *s2 == '\0') { - *errnum = EBADF; - return (-1); - } - } - } - - if (no_need_to_validate_entire) - break; - } - - return (ret_val); -} - -/* - * The do_case_conv() looks at the mapping tables and returns found - * bytes if any. If not found, the input bytes are returned. The function - * always terminate the return bytes with a null character assuming that - * there are plenty of room to do so. - * - * The case conversions are simple case conversions mapping a character to - * another character as specified in the Unicode data. The byte size of - * the mapped character could be different from that of the input character. - * - * The return value is the byte length of the returned character excluding - * the terminating null byte. - */ -static size_t -do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) -{ - size_t i; - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b3_tbl; - uint16_t b3_base; - uint16_t b4 = 0; - size_t start_id; - size_t end_id; - - /* - * At this point, the only possible values for sz are 2, 3, and 4. - * The u8s should point to a vector that is well beyond the size of - * 5 bytes. 
- */ - if (sz == 2) { - b3 = u8s[0] = s[0]; - b4 = u8s[1] = s[1]; - } else if (sz == 3) { - b2 = u8s[0] = s[0]; - b3 = u8s[1] = s[1]; - b4 = u8s[2] = s[2]; - } else if (sz == 4) { - b1 = u8s[0] = s[0]; - b2 = u8s[1] = s[1]; - b3 = u8s[2] = s[2]; - b4 = u8s[3] = s[3]; - } else { - /* This is not possible but just in case as a fallback. */ - if (is_it_toupper) - *u8s = U8_ASCII_TOUPPER(*s); - else - *u8s = U8_ASCII_TOLOWER(*s); - u8s[1] = '\0'; - - return (1); - } - u8s[sz] = '\0'; - - /* - * Let's find out if we have a corresponding character. - */ - b1 = u8_common_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - b2 = u8_case_common_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - if (is_it_toupper) { - b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; - - /* Either there is no match or an error at the table. */ - if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) - return ((size_t)sz); - - b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; - - for (i = 0; start_id < end_id; start_id++) - u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; - } else { - b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; - - if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) - return ((size_t)sz); - - b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; - - for (i = 0; start_id < end_id; start_id++) - u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; - } - - /* - * If i is still zero, that means there is no corresponding character. 
- */ - if (i == 0) - return ((size_t)sz); - - u8s[i] = '\0'; - - return (i); -} - -/* - * The do_case_compare() function compares the two input strings, s1 and s2, - * one character at a time doing case conversions if applicable and return - * the comparison result as like strcmp(). - * - * Since, in empirical sense, most of text data are 7-bit ASCII characters, - * we treat the 7-bit ASCII characters as a special case trying to yield - * faster processing time. - */ -static int -do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, - size_t n2, boolean_t is_it_toupper, int *errnum) -{ - int f; - int sz1; - int sz2; - size_t j; - size_t i1; - size_t i2; - uchar_t u8s1[U8_MB_CUR_MAX + 1]; - uchar_t u8s2[U8_MB_CUR_MAX + 1]; - - i1 = i2 = 0; - while (i1 < n1 && i2 < n2) { - /* - * Find out what would be the byte length for this UTF-8 - * character at string s1 and also find out if this is - * an illegal start byte or not and if so, issue a proper - * error number and yet treat this byte as a character. - */ - sz1 = u8_number_of_bytes[*s1]; - if (sz1 < 0) { - *errnum = EILSEQ; - sz1 = 1; - } - - /* - * For 7-bit ASCII characters mainly, we do a quick case - * conversion right at here. - * - * If we don't have enough bytes for this character, issue - * an EINVAL error and use what are available. - * - * If we have enough bytes, find out if there is - * a corresponding uppercase character and if so, copy over - * the bytes for a comparison later. If there is no - * corresponding uppercase character, then, use what we have - * for the comparison. - */ - if (sz1 == 1) { - if (is_it_toupper) - u8s1[0] = U8_ASCII_TOUPPER(*s1); - else - u8s1[0] = U8_ASCII_TOLOWER(*s1); - s1++; - u8s1[1] = '\0'; - } else if ((i1 + sz1) > n1) { - *errnum = EINVAL; - for (j = 0; (i1 + j) < n1; ) - u8s1[j++] = *s1++; - u8s1[j] = '\0'; - } else { - (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); - s1 += sz1; - } - - /* Do the same for the string s2. 
*/ - sz2 = u8_number_of_bytes[*s2]; - if (sz2 < 0) { - *errnum = EILSEQ; - sz2 = 1; - } - - if (sz2 == 1) { - if (is_it_toupper) - u8s2[0] = U8_ASCII_TOUPPER(*s2); - else - u8s2[0] = U8_ASCII_TOLOWER(*s2); - s2++; - u8s2[1] = '\0'; - } else if ((i2 + sz2) > n2) { - *errnum = EINVAL; - for (j = 0; (i2 + j) < n2; ) - u8s2[j++] = *s2++; - u8s2[j] = '\0'; - } else { - (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); - s2 += sz2; - } - - /* Now compare the two characters. */ - if (sz1 == 1 && sz2 == 1) { - if (*u8s1 > *u8s2) - return (1); - if (*u8s1 < *u8s2) - return (-1); - } else { - f = strcmp((const char *)u8s1, (const char *)u8s2); - if (f != 0) - return (f); - } - - /* - * They were the same. Let's move on to the next - * characters then. - */ - i1 += sz1; - i2 += sz2; - } - - /* - * We compared until the end of either or both strings. - * - * If we reached to or went over the ends for the both, that means - * they are the same. - * - * If we reached only one of the two ends, that means the other string - * has something which then the fact can be used to determine - * the return value. - */ - if (i1 >= n1) { - if (i2 >= n2) - return (0); - return (-1); - } - return (1); -} - -/* - * The combining_class() function checks on the given bytes and find out - * the corresponding Unicode combining class value. The return value 0 means - * it is a Starter. Any illegal UTF-8 character will also be treated as - * a Starter. 
- */ -static uchar_t -combining_class(size_t uv, uchar_t *s, size_t sz) -{ - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b4 = 0; - - if (sz == 1 || sz > 4) - return (0); - - if (sz == 2) { - b3 = s[0]; - b4 = s[1]; - } else if (sz == 3) { - b2 = s[0]; - b3 = s[1]; - b4 = s[2]; - } else if (sz == 4) { - b1 = s[0]; - b2 = s[1]; - b3 = s[2]; - b4 = s[3]; - } - - b1 = u8_common_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return (0); - - b2 = u8_combining_class_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return (0); - - b3 = u8_combining_class_b3_tbl[uv][b2][b3]; - if (b3 == U8_TBL_ELEMENT_NOT_DEF) - return (0); - - return (u8_combining_class_b4_tbl[uv][b3][b4]); -} - -/* - * The do_decomp() function finds out a matching decomposition if any - * and return. If there is no match, the input bytes are copied and returned. - * The function also checks if there is a Hangul, decomposes it if necessary - * and returns. - * - * To save time, a single byte 7-bit ASCII character should be handled by - * the caller. - * - * The function returns the number of bytes returned sans always terminating - * the null byte. It will also return a state that will tell if there was - * a Hangul character decomposed which then will be used by the caller. - */ -static size_t -do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, - boolean_t canonical_decomposition, u8_normalization_states_t *state) -{ - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b3_tbl; - uint16_t b3_base; - uint16_t b4 = 0; - size_t start_id; - size_t end_id; - size_t i; - uint32_t u1; - - if (sz == 2) { - b3 = u8s[0] = s[0]; - b4 = u8s[1] = s[1]; - u8s[2] = '\0'; - } else if (sz == 3) { - /* Convert it to a Unicode scalar value. */ - U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); - - /* - * If this is a Hangul syllable, we decompose it into - * a leading consonant, a vowel, and an optional trailing - * consonant and then return. 
- */ - if (U8_HANGUL_SYLLABLE(u1)) { - u1 -= U8_HANGUL_SYL_FIRST; - - b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; - b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) - / U8_HANGUL_T_COUNT; - b3 = u1 % U8_HANGUL_T_COUNT; - - U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); - U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); - if (b3) { - b3 += U8_HANGUL_JAMO_T_FIRST; - U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); - - u8s[9] = '\0'; - *state = U8_STATE_HANGUL_LVT; - return (9); - } - - u8s[6] = '\0'; - *state = U8_STATE_HANGUL_LV; - return (6); - } - - b2 = u8s[0] = s[0]; - b3 = u8s[1] = s[1]; - b4 = u8s[2] = s[2]; - u8s[3] = '\0'; - - /* - * If this is a Hangul Jamo, we know there is nothing - * further that we can decompose. - */ - if (U8_HANGUL_JAMO_L(u1)) { - *state = U8_STATE_HANGUL_L; - return (3); - } - - if (U8_HANGUL_JAMO_V(u1)) { - if (*state == U8_STATE_HANGUL_L) - *state = U8_STATE_HANGUL_LV; - else - *state = U8_STATE_HANGUL_V; - return (3); - } - - if (U8_HANGUL_JAMO_T(u1)) { - if (*state == U8_STATE_HANGUL_LV) - *state = U8_STATE_HANGUL_LVT; - else - *state = U8_STATE_HANGUL_T; - return (3); - } - } else if (sz == 4) { - b1 = u8s[0] = s[0]; - b2 = u8s[1] = s[1]; - b3 = u8s[2] = s[2]; - b4 = u8s[3] = s[3]; - u8s[4] = '\0'; - } else { - /* - * This is a fallback and should not happen if the function - * was called properly. - */ - u8s[0] = s[0]; - u8s[1] = '\0'; - *state = U8_STATE_START; - return (1); - } - - /* - * At this point, this rountine does not know what it would get. - * The caller should sort it out if the state isn't a Hangul one. - */ - *state = U8_STATE_START; - - /* Try to find matching decomposition mapping byte sequence. 
*/ - b1 = u8_common_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - b2 = u8_decomp_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - /* - * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR - * which is 0x8000, this means we couldn't fit the mappings into - * the cardinality of a unsigned byte. - */ - if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { - b3_tbl -= U8_16BIT_TABLE_INDICATOR; - start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; - end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; - } else { - start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; - } - - /* This also means there wasn't any matching decomposition. */ - if (start_id >= end_id) - return ((size_t)sz); - - /* - * The final table for decomposition mappings has three types of - * byte sequences depending on whether a mapping is for compatibility - * decomposition, canonical decomposition, or both like the following: - * - * (1) Compatibility decomposition mappings: - * - * +---+---+-...-+---+ - * | B0| B1| ... | Bm| - * +---+---+-...-+---+ - * - * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH). - * - * (2) Canonical decomposition mappings: - * - * +---+---+---+-...-+---+ - * | T | b0| b1| ... | bn| - * +---+---+---+-...-+---+ - * - * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). - * - * (3) Both mappings: - * - * +---+---+---+---+-...-+---+---+---+-...-+---+ - * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| - * +---+---+---+---+-...-+---+---+---+-...-+---+ - * - * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement - * byte, b0 to bn are canonical mapping bytes and B0 to Bm are - * compatibility mapping bytes. 
- * - * Note that compatibility decomposition means doing recursive - * decompositions using both compatibility decomposition mappings and - * canonical decomposition mappings. On the other hand, canonical - * decomposition means doing recursive decompositions using only - * canonical decomposition mappings. Since the table we have has gone - * through the recursions already, we do not need to do so during - * runtime, i.e., the table has been completely flattened out - * already. - */ - - b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; - - /* Get the type, T, of the byte sequence. */ - b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; - - /* - * If necessary, adjust start_id, end_id, or both. Note that if - * this is compatibility decomposition mapping, there is no - * adjustment. - */ - if (canonical_decomposition) { - /* Is the mapping only for compatibility decomposition? */ - if (b1 < U8_DECOMP_BOTH) - return ((size_t)sz); - - start_id++; - - if (b1 == U8_DECOMP_BOTH) { - end_id = start_id + - u8_decomp_final_tbl[uv][b3_base + start_id]; - start_id++; - } - } else { - /* - * Unless this is a compatibility decomposition mapping, - * we adjust the start_id. - */ - if (b1 == U8_DECOMP_BOTH) { - start_id++; - start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; - } else if (b1 == U8_DECOMP_CANONICAL) { - start_id++; - } - } - - for (i = 0; start_id < end_id; start_id++) - u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; - u8s[i] = '\0'; - - return (i); -} - -/* - * The find_composition_start() function uses the character bytes given and - * find out the matching composition mappings if any and return the address - * to the composition mappings as explained in the do_composition(). 
- */ -static uchar_t * -find_composition_start(size_t uv, uchar_t *s, size_t sz) -{ - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b3_tbl; - uint16_t b3_base; - uint16_t b4 = 0; - size_t start_id; - size_t end_id; - - if (sz == 1) { - b4 = s[0]; - } else if (sz == 2) { - b3 = s[0]; - b4 = s[1]; - } else if (sz == 3) { - b2 = s[0]; - b3 = s[1]; - b4 = s[2]; - } else if (sz == 4) { - b1 = s[0]; - b2 = s[1]; - b3 = s[2]; - b4 = s[3]; - } else { - /* - * This is a fallback and should not happen if the function - * was called properly. - */ - return (NULL); - } - - b1 = u8_composition_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return (NULL); - - b2 = u8_composition_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return (NULL); - - b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return (NULL); - - if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { - b3_tbl -= U8_16BIT_TABLE_INDICATOR; - start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; - end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; - } else { - start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; - } - - if (start_id >= end_id) - return (NULL); - - b3_base = u8_composition_b3_tbl[uv][b2][b3].base; - - return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); -} - -/* - * The blocked() function checks on the combining class values of previous - * characters in this sequence and return whether it is blocked or not. 
- */ -static boolean_t -blocked(uchar_t *comb_class, size_t last) -{ - uchar_t my_comb_class; - size_t i; - - my_comb_class = comb_class[last]; - for (i = 1; i < last; i++) - if (comb_class[i] >= my_comb_class || - comb_class[i] == U8_COMBINING_CLASS_STARTER) - return (B_TRUE); - - return (B_FALSE); -} - -/* - * The do_composition() reads the character string pointed by 's' and - * do necessary canonical composition and then copy over the result back to - * the 's'. - * - * The input argument 's' cannot contain more than 32 characters. - */ -static size_t -do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, - uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) -{ - uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; - uchar_t tc[U8_MB_CUR_MAX]; - uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; - size_t saved_marks_count; - uchar_t *p; - uchar_t *saved_p; - uchar_t *q; - size_t i; - size_t saved_i; - size_t j; - size_t k; - size_t l; - size_t C; - size_t saved_l; - size_t size; - uint32_t u1; - uint32_t u2; - boolean_t match_not_found = B_TRUE; - - /* - * This should never happen unless the callers are doing some strange - * and unexpected things. - * - * The "last" is the index pointing to the last character not last + 1. - */ - if (last >= U8_MAX_CHARS_A_SEQ) - last = U8_UPPER_LIMIT_IN_A_SEQ; - - for (i = l = 0; i <= last; i++) { - /* - * The last or any non-Starters at the beginning, we don't - * have any chance to do composition and so we just copy them - * to the temporary buffer. - */ - if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { -SAVE_THE_CHAR: - p = s + start[i]; - size = disp[i]; - for (k = 0; k < size; k++) - t[l++] = *p++; - continue; - } - - /* - * If this could be a start of Hangul Jamos, then, we try to - * conjoin them. 
- */ - if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { - U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], - s[start[i] + 1], s[start[i] + 2]); - U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], - s[start[i] + 4], s[start[i] + 5]); - - if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { - u1 -= U8_HANGUL_JAMO_L_FIRST; - u2 -= U8_HANGUL_JAMO_V_FIRST; - u1 = U8_HANGUL_SYL_FIRST + - (u1 * U8_HANGUL_V_COUNT + u2) * - U8_HANGUL_T_COUNT; - - i += 2; - if (i <= last) { - U8_PUT_3BYTES_INTO_UTF32(u2, - s[start[i]], s[start[i] + 1], - s[start[i] + 2]); - - if (U8_HANGUL_JAMO_T(u2)) { - u1 += u2 - - U8_HANGUL_JAMO_T_FIRST; - i++; - } - } - - U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); - i--; - l += 3; - continue; - } - } - - /* - * Let's then find out if this Starter has composition - * mapping. - */ - p = find_composition_start(uv, s + start[i], disp[i]); - if (p == NULL) - goto SAVE_THE_CHAR; - - /* - * We have a Starter with composition mapping and the next - * character is a non-Starter. Let's try to find out if - * we can do composition. - */ - - saved_p = p; - saved_i = i; - saved_l = l; - saved_marks_count = 0; - -TRY_THE_NEXT_MARK: - q = s + start[++i]; - size = disp[i]; - - /* - * The next for() loop compares the non-Starter pointed by - * 'q' with the possible (joinable) characters pointed by 'p'. - * - * The composition final table entry pointed by the 'p' - * looks like the following: - * - * +---+---+---+-...-+---+---+---+---+-...-+---+---+ - * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | - * +---+---+---+-...-+---+---+---+---+-...-+---+---+ - * - * where C is the count byte indicating the number of - * mapping pairs where each pair would be look like - * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second - * character of a canonical decomposition and the B0-Bm are - * the bytes of a matching composite character. The F is - * a filler byte after each character as the separator. 
- */ - - match_not_found = B_TRUE; - - for (C = *p++; C > 0; C--) { - for (k = 0; k < size; p++, k++) - if (*p != q[k]) - break; - - /* Have we found it? */ - if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { - match_not_found = B_FALSE; - - l = saved_l; - - while (*++p != U8_TBL_ELEMENT_FILLER) - t[l++] = *p; - - break; - } - - /* We didn't find; skip to the next pair. */ - if (*p != U8_TBL_ELEMENT_FILLER) - while (*++p != U8_TBL_ELEMENT_FILLER) - ; - while (*++p != U8_TBL_ELEMENT_FILLER) - ; - p++; - } - - /* - * If there was no match, we will need to save the combining - * mark for later appending. After that, if the next one - * is a non-Starter and not blocked, then, we try once - * again to do composition with the next non-Starter. - * - * If there was no match and this was a Starter, then, - * this is a new start. - * - * If there was a match and a composition done and we have - * more to check on, then, we retrieve a new composition final - * table entry for the composite and then try to do the - * composition again. - */ - - if (match_not_found) { - if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { - i--; - goto SAVE_THE_CHAR; - } - - saved_marks[saved_marks_count++] = i; - } - - if (saved_l == l) { - while (i < last) { - if (blocked(comb_class, i + 1)) - saved_marks[saved_marks_count++] = ++i; - else - break; - } - if (i < last) { - p = saved_p; - goto TRY_THE_NEXT_MARK; - } - } else if (i < last) { - p = find_composition_start(uv, t + saved_l, - l - saved_l); - if (p != NULL) { - saved_p = p; - goto TRY_THE_NEXT_MARK; - } - } - - /* - * There is no more composition possible. - * - * If there was no composition what so ever then we copy - * over the original Starter and then append any non-Starters - * remaining at the target string sequentially after that. 
- */ - - if (saved_l == l) { - p = s + start[saved_i]; - size = disp[saved_i]; - for (j = 0; j < size; j++) - t[l++] = *p++; - } - - for (k = 0; k < saved_marks_count; k++) { - p = s + start[saved_marks[k]]; - size = disp[saved_marks[k]]; - for (j = 0; j < size; j++) - t[l++] = *p++; - } - } - - /* - * If the last character is a Starter and if we have a character - * (possibly another Starter) that can be turned into a composite, - * we do so and we do so until there is no more of composition - * possible. - */ - if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { - p = *os; - saved_l = l - disp[last]; - - while (p < oslast) { - size = u8_number_of_bytes[*p]; - if (size <= 1 || (p + size) > oslast) - break; - - saved_p = p; - - for (i = 0; i < size; i++) - tc[i] = *p++; - - q = find_composition_start(uv, t + saved_l, - l - saved_l); - if (q == NULL) { - p = saved_p; - break; - } - - match_not_found = B_TRUE; - - for (C = *q++; C > 0; C--) { - for (k = 0; k < size; q++, k++) - if (*q != tc[k]) - break; - - if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { - match_not_found = B_FALSE; - - l = saved_l; - - while (*++q != U8_TBL_ELEMENT_FILLER) { - /* - * This is practically - * impossible but we don't - * want to take any chances. - */ - if (l >= - U8_STREAM_SAFE_TEXT_MAX) { - p = saved_p; - goto SAFE_RETURN; - } - t[l++] = *q; - } - - break; - } - - if (*q != U8_TBL_ELEMENT_FILLER) - while (*++q != U8_TBL_ELEMENT_FILLER) - ; - while (*++q != U8_TBL_ELEMENT_FILLER) - ; - q++; - } - - if (match_not_found) { - p = saved_p; - break; - } - } -SAFE_RETURN: - *os = p; - } - - /* - * Now we copy over the temporary string to the target string. - * Since composition always reduces the number of characters or - * the number of characters stay, we don't need to worry about - * the buffer overflow here. 
- */ - for (i = 0; i < l; i++) - s[i] = t[i]; - s[l] = '\0'; - - return (l); -} - -/* - * The collect_a_seq() function checks on the given string s, collect - * a sequence of characters at u8s, and return the sequence. While it collects - * a sequence, it also applies case conversion, canonical or compatibility - * decomposition, canonical decomposition, or some or all of them and - * in that order. - * - * The collected sequence cannot be bigger than 32 characters since if - * it is having more than 31 characters, the sequence will be terminated - * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into - * a Stream-Safe Text. The collected sequence is always terminated with - * a null byte and the return value is the byte length of the sequence - * including 0. The return value does not include the terminating - * null byte. - */ -static size_t -collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, - boolean_t is_it_toupper, - boolean_t is_it_tolower, - boolean_t canonical_decomposition, - boolean_t compatibility_decomposition, - boolean_t canonical_composition, - int *errnum, u8_normalization_states_t *state) -{ - uchar_t *s; - int sz; - int saved_sz; - size_t i; - size_t j; - size_t k; - size_t l; - uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; - uchar_t disp[U8_MAX_CHARS_A_SEQ]; - uchar_t start[U8_MAX_CHARS_A_SEQ]; - uchar_t u8t[U8_MB_CUR_MAX]; - uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; - uchar_t tc; - size_t last; - size_t saved_last; - uint32_t u1; - - /* - * Save the source string pointer which we will return a changed - * pointer if we do processing. - */ - s = *source; - - /* - * The following is a fallback for just in case callers are not - * checking the string boundaries before the calling. - */ - if (s >= slast) { - u8s[0] = '\0'; - - return (0); - } - - /* - * As the first thing, let's collect a character and do case - * conversion if necessary. 
- */ - - sz = u8_number_of_bytes[*s]; - - if (sz < 0) { - *errnum = EILSEQ; - - u8s[0] = *s++; - u8s[1] = '\0'; - - *source = s; - - return (1); - } - - if (sz == 1) { - if (is_it_toupper) - u8s[0] = U8_ASCII_TOUPPER(*s); - else if (is_it_tolower) - u8s[0] = U8_ASCII_TOLOWER(*s); - else - u8s[0] = *s; - s++; - u8s[1] = '\0'; - } else if ((s + sz) > slast) { - *errnum = EINVAL; - - for (i = 0; s < slast; ) - u8s[i++] = *s++; - u8s[i] = '\0'; - - *source = s; - - return (i); - } else { - if (is_it_toupper || is_it_tolower) { - i = do_case_conv(uv, u8s, s, sz, is_it_toupper); - s += sz; - sz = i; - } else { - for (i = 0; i < sz; ) - u8s[i++] = *s++; - u8s[i] = '\0'; - } - } - - /* - * And then canonical/compatibility decomposition followed by - * an optional canonical composition. Please be noted that - * canonical composition is done only when a decomposition is - * done. - */ - if (canonical_decomposition || compatibility_decomposition) { - if (sz == 1) { - *state = U8_STATE_START; - - saved_sz = 1; - - comb_class[0] = 0; - start[0] = 0; - disp[0] = 1; - - last = 1; - } else { - saved_sz = do_decomp(uv, u8s, u8s, sz, - canonical_decomposition, state); - - last = 0; - - for (i = 0; i < saved_sz; ) { - sz = u8_number_of_bytes[u8s[i]]; - - comb_class[last] = combining_class(uv, - u8s + i, sz); - start[last] = i; - disp[last] = sz; - - last++; - i += sz; - } - - /* - * Decomposition yields various Hangul related - * states but not on combining marks. We need to - * find out at here by checking on the last - * character. - */ - if (*state == U8_STATE_START) { - if (comb_class[last - 1]) - *state = U8_STATE_COMBINING_MARK; - } - } - - saved_last = last; - - while (s < slast) { - sz = u8_number_of_bytes[*s]; - - /* - * If this is an illegal character, an incomplete - * character, or an 7-bit ASCII Starter character, - * then we have collected a sequence; break and let - * the next call deal with the two cases. 
- * - * Note that this is okay only if you are using this - * function with a fixed length string, not on - * a buffer with multiple calls of one chunk at a time. - */ - if (sz <= 1) { - break; - } else if ((s + sz) > slast) { - break; - } else { - /* - * If the previous character was a Hangul Jamo - * and this character is a Hangul Jamo that - * can be conjoined, we collect the Jamo. - */ - if (*s == U8_HANGUL_JAMO_1ST_BYTE) { - U8_PUT_3BYTES_INTO_UTF32(u1, - *s, *(s + 1), *(s + 2)); - - if (U8_HANGUL_COMPOSABLE_L_V(*state, - u1)) { - i = 0; - *state = U8_STATE_HANGUL_LV; - goto COLLECT_A_HANGUL; - } - - if (U8_HANGUL_COMPOSABLE_LV_T(*state, - u1)) { - i = 0; - *state = U8_STATE_HANGUL_LVT; - goto COLLECT_A_HANGUL; - } - } - - /* - * Regardless of whatever it was, if this is - * a Starter, we don't collect the character - * since that's a new start and we will deal - * with it at the next time. - */ - i = combining_class(uv, s, sz); - if (i == U8_COMBINING_CLASS_STARTER) - break; - - /* - * We know the current character is a combining - * mark. If the previous character wasn't - * a Starter (not Hangul) or a combining mark, - * then, we don't collect this combining mark. - */ - if (*state != U8_STATE_START && - *state != U8_STATE_COMBINING_MARK) - break; - - *state = U8_STATE_COMBINING_MARK; -COLLECT_A_HANGUL: - /* - * If we collected a Starter and combining - * marks up to 30, i.e., total 31 characters, - * then, we terminate this degenerately long - * combining sequence with a U+034F COMBINING - * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in - * UTF-8 and turn this into a Stream-Safe - * Text. This will be extremely rare but - * possible. - * - * The following will also guarantee that - * we are not writing more than 32 characters - * plus a NULL at u8s[]. 
- */ - if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { -TURN_STREAM_SAFE: - *state = U8_STATE_START; - comb_class[last] = 0; - start[last] = saved_sz; - disp[last] = 2; - last++; - - u8s[saved_sz++] = 0xCD; - u8s[saved_sz++] = 0x8F; - - break; - } - - /* - * Some combining marks also do decompose into - * another combining mark or marks. - */ - if (*state == U8_STATE_COMBINING_MARK) { - k = last; - l = sz; - i = do_decomp(uv, uts, s, sz, - canonical_decomposition, state); - for (j = 0; j < i; ) { - sz = u8_number_of_bytes[uts[j]]; - - comb_class[last] = - combining_class(uv, - uts + j, sz); - start[last] = saved_sz + j; - disp[last] = sz; - - last++; - if (last >= - U8_UPPER_LIMIT_IN_A_SEQ) { - last = k; - goto TURN_STREAM_SAFE; - } - j += sz; - } - - *state = U8_STATE_COMBINING_MARK; - sz = i; - s += l; - - for (i = 0; i < sz; i++) - u8s[saved_sz++] = uts[i]; - } else { - comb_class[last] = i; - start[last] = saved_sz; - disp[last] = sz; - last++; - - for (i = 0; i < sz; i++) - u8s[saved_sz++] = *s++; - } - - /* - * If this is U+0345 COMBINING GREEK - * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., - * iota subscript, and need to be converted to - * uppercase letter, convert it to U+0399 GREEK - * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), - * i.e., convert to capital adscript form as - * specified in the Unicode standard. - * - * This is the only special case of (ambiguous) - * case conversion at combining marks and - * probably the standard will never have - * anything similar like this in future. - */ - if (is_it_toupper && sz >= 2 && - u8s[saved_sz - 2] == 0xCD && - u8s[saved_sz - 1] == 0x85) { - u8s[saved_sz - 2] = 0xCE; - u8s[saved_sz - 1] = 0x99; - } - } - } - - /* - * Let's try to ensure a canonical ordering for the collected - * combining marks. We do this only if we have collected - * at least one more non-Starter. (The decomposition mapping - * data tables have fully (and recursively) expanded and - * canonically ordered decompositions.) 
- * - * The U8_SWAP_COMB_MARKS() convenience macro has some - * assumptions and we are meeting the assumptions. - */ - last--; - if (last >= saved_last) { - for (i = 0; i < last; i++) - for (j = last; j > i; j--) - if (comb_class[j] && - comb_class[j - 1] > comb_class[j]) { - U8_SWAP_COMB_MARKS(j - 1, j); - } - } - - *source = s; - - if (! canonical_composition) { - u8s[saved_sz] = '\0'; - return (saved_sz); - } - - /* - * Now do the canonical composition. Note that we do this - * only after a canonical or compatibility decomposition to - * finish up NFC or NFKC. - */ - sz = do_composition(uv, u8s, comb_class, start, disp, last, - &s, slast); - } - - *source = s; - - return ((size_t)sz); -} - -/* - * The do_norm_compare() function does string comparion based on Unicode - * simple case mappings and Unicode Normalization definitions. - * - * It does so by collecting a sequence of character at a time and comparing - * the collected sequences from the strings. - * - * The meanings on the return values are the same as the usual strcmp(). 
- */ -static int -do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, - int flag, int *errnum) -{ - int result; - size_t sz1; - size_t sz2; - uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; - uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; - uchar_t *s1last; - uchar_t *s2last; - boolean_t is_it_toupper; - boolean_t is_it_tolower; - boolean_t canonical_decomposition; - boolean_t compatibility_decomposition; - boolean_t canonical_composition; - u8_normalization_states_t state; - - s1last = s1 + n1; - s2last = s2 + n2; - - is_it_toupper = flag & U8_TEXTPREP_TOUPPER; - is_it_tolower = flag & U8_TEXTPREP_TOLOWER; - canonical_decomposition = flag & U8_CANON_DECOMP; - compatibility_decomposition = flag & U8_COMPAT_DECOMP; - canonical_composition = flag & U8_CANON_COMP; - - while (s1 < s1last && s2 < s2last) { - /* - * If the current character is a 7-bit ASCII and the last - * character, or, if the current character and the next - * character are both some 7-bit ASCII characters then - * we treat the current character as a sequence. - * - * In any other cases, we need to call collect_a_seq(). 
- */ - - if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || - ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { - if (is_it_toupper) - u8s1[0] = U8_ASCII_TOUPPER(*s1); - else if (is_it_tolower) - u8s1[0] = U8_ASCII_TOLOWER(*s1); - else - u8s1[0] = *s1; - u8s1[1] = '\0'; - sz1 = 1; - s1++; - } else { - state = U8_STATE_START; - sz1 = collect_a_seq(uv, u8s1, &s1, s1last, - is_it_toupper, is_it_tolower, - canonical_decomposition, - compatibility_decomposition, - canonical_composition, errnum, &state); - } - - if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || - ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { - if (is_it_toupper) - u8s2[0] = U8_ASCII_TOUPPER(*s2); - else if (is_it_tolower) - u8s2[0] = U8_ASCII_TOLOWER(*s2); - else - u8s2[0] = *s2; - u8s2[1] = '\0'; - sz2 = 1; - s2++; - } else { - state = U8_STATE_START; - sz2 = collect_a_seq(uv, u8s2, &s2, s2last, - is_it_toupper, is_it_tolower, - canonical_decomposition, - compatibility_decomposition, - canonical_composition, errnum, &state); - } - - /* - * Now compare the two characters. If they are the same, - * we move on to the next character sequences. - */ - if (sz1 == 1 && sz2 == 1) { - if (*u8s1 > *u8s2) - return (1); - if (*u8s1 < *u8s2) - return (-1); - } else { - result = strcmp((const char *)u8s1, (const char *)u8s2); - if (result != 0) - return (result); - } - } - - /* - * We compared until the end of either or both strings. - * - * If we reached to or went over the ends for the both, that means - * they are the same. - * - * If we reached only one end, that means the other string has - * something which then can be used to determine the return value. - */ - if (s1 >= s1last) { - if (s2 >= s2last) - return (0); - return (-1); - } - return (1); -} - -/* - * The u8_strcmp() function compares two UTF-8 strings quite similar to - * the strcmp(). 
For the comparison, however, Unicode Normalization specific - * equivalency and Unicode simple case conversion mappings based equivalency - * can be requested and checked against. - */ -int -u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, - int *errnum) -{ - int f; - size_t n1; - size_t n2; - - *errnum = 0; - - /* - * Check on the requested Unicode version, case conversion, and - * normalization flag values. - */ - - if (uv > U8_UNICODE_LATEST) { - *errnum = ERANGE; - uv = U8_UNICODE_LATEST; - } - - if (flag == 0) { - flag = U8_STRCMP_CS; - } else { - f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | - U8_STRCMP_CI_LOWER); - if (f == 0) { - flag |= U8_STRCMP_CS; - } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && - f != U8_STRCMP_CI_LOWER) { - *errnum = EBADF; - flag = U8_STRCMP_CS; - } - - f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); - if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && - f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { - *errnum = EBADF; - flag = U8_STRCMP_CS; - } - } - - if (flag == U8_STRCMP_CS) { - return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); - } - - n1 = strlen(s1); - n2 = strlen(s2); - if (n != 0) { - if (n < n1) - n1 = n; - if (n < n2) - n2 = n; - } - - /* - * Simple case conversion can be done much faster and so we do - * them separately here. 
- */ - if (flag == U8_STRCMP_CI_UPPER) { - return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, - n1, n2, B_TRUE, errnum)); - } else if (flag == U8_STRCMP_CI_LOWER) { - return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, - n1, n2, B_FALSE, errnum)); - } - - return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, - flag, errnum)); -} - -size_t -u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, - int flag, size_t unicode_version, int *errnum) -{ - int f; - int sz; - uchar_t *ib; - uchar_t *ibtail; - uchar_t *ob; - uchar_t *obtail; - boolean_t do_not_ignore_null; - boolean_t do_not_ignore_invalid; - boolean_t is_it_toupper; - boolean_t is_it_tolower; - boolean_t canonical_decomposition; - boolean_t compatibility_decomposition; - boolean_t canonical_composition; - size_t ret_val; - size_t i; - size_t j; - uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; - u8_normalization_states_t state; - - if (unicode_version > U8_UNICODE_LATEST) { - *errnum = ERANGE; - return ((size_t)-1); - } - - f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); - if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { - *errnum = EBADF; - return ((size_t)-1); - } - - f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); - if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && - f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { - *errnum = EBADF; - return ((size_t)-1); - } - - if (inarray == NULL || *inlen == 0) - return (0); - - if (outarray == NULL) { - *errnum = E2BIG; - return ((size_t)-1); - } - - ib = (uchar_t *)inarray; - ob = (uchar_t *)outarray; - ibtail = ib + *inlen; - obtail = ob + *outlen; - - do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); - do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); - is_it_toupper = flag & U8_TEXTPREP_TOUPPER; - is_it_tolower = flag & U8_TEXTPREP_TOLOWER; - - ret_val = 0; - - /* - * If we don't have a normalization flag set, we do the simple case - * conversion based text 
preparation separately below. Text - * preparation involving Normalization will be done in the false task - * block, again, separately since it will take much more time and - * resource than doing simple case conversions. - */ - if (f == 0) { - while (ib < ibtail) { - if (*ib == '\0' && do_not_ignore_null) - break; - - sz = u8_number_of_bytes[*ib]; - - if (sz < 0) { - if (do_not_ignore_invalid) { - *errnum = EILSEQ; - ret_val = (size_t)-1; - break; - } - - sz = 1; - ret_val++; - } - - if (sz == 1) { - if (ob >= obtail) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - if (is_it_toupper) - *ob = U8_ASCII_TOUPPER(*ib); - else if (is_it_tolower) - *ob = U8_ASCII_TOLOWER(*ib); - else - *ob = *ib; - ib++; - ob++; - } else if ((ib + sz) > ibtail) { - if (do_not_ignore_invalid) { - *errnum = EINVAL; - ret_val = (size_t)-1; - break; - } - - if ((obtail - ob) < (ibtail - ib)) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - /* - * We treat the remaining incomplete character - * bytes as a character. 
- */ - ret_val++; - - while (ib < ibtail) - *ob++ = *ib++; - } else { - if (is_it_toupper || is_it_tolower) { - i = do_case_conv(unicode_version, u8s, - ib, sz, is_it_toupper); - - if ((obtail - ob) < i) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - ib += sz; - - for (sz = 0; sz < i; sz++) - *ob++ = u8s[sz]; - } else { - if ((obtail - ob) < sz) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - for (i = 0; i < sz; i++) - *ob++ = *ib++; - } - } - } - } else { - canonical_decomposition = flag & U8_CANON_DECOMP; - compatibility_decomposition = flag & U8_COMPAT_DECOMP; - canonical_composition = flag & U8_CANON_COMP; - - while (ib < ibtail) { - if (*ib == '\0' && do_not_ignore_null) - break; - - /* - * If the current character is a 7-bit ASCII - * character and it is the last character, or, - * if the current character is a 7-bit ASCII - * character and the next character is also a 7-bit - * ASCII character, then, we copy over this - * character without going through collect_a_seq(). - * - * In any other cases, we need to look further with - * the collect_a_seq() function. 
- */ - if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || - ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { - if (ob >= obtail) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - if (is_it_toupper) - *ob = U8_ASCII_TOUPPER(*ib); - else if (is_it_tolower) - *ob = U8_ASCII_TOLOWER(*ib); - else - *ob = *ib; - ib++; - ob++; - } else { - *errnum = 0; - state = U8_STATE_START; - - j = collect_a_seq(unicode_version, u8s, - &ib, ibtail, - is_it_toupper, - is_it_tolower, - canonical_decomposition, - compatibility_decomposition, - canonical_composition, - errnum, &state); - - if (*errnum && do_not_ignore_invalid) { - ret_val = (size_t)-1; - break; - } - - if ((obtail - ob) < j) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - for (i = 0; i < j; i++) - *ob++ = u8s[i]; - } - } - } - - *inlen = ibtail - ib; - *outlen = obtail - ob; - - return (ret_val); -} From cf06f2f58fe4836effcbd1bde54e9d063da23f50 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 11 Dec 2008 15:34:32 -0800 Subject: [PATCH 2/2] Whoops put these back --- lib/libspl/include/sys/list.h | 67 + lib/libspl/include/sys/list_impl.h | 53 + lib/libspl/list.c | 245 ++++ lib/libspl/mkdirp.c | 212 +++ lib/libspl/strlcat.c | 59 + lib/libspl/strlcpy.c | 55 + lib/libspl/strnlen.c | 47 + lib/libspl/u8_textprep.c | 2133 ++++++++++++++++++++++++++++ 8 files changed, 2871 insertions(+) create mode 100644 lib/libspl/include/sys/list.h create mode 100644 lib/libspl/include/sys/list_impl.h create mode 100644 lib/libspl/list.c create mode 100644 lib/libspl/mkdirp.c create mode 100644 lib/libspl/strlcat.c create mode 100644 lib/libspl/strlcpy.c create mode 100644 lib/libspl/strnlen.c create mode 100644 lib/libspl/u8_textprep.c diff --git a/lib/libspl/include/sys/list.h b/lib/libspl/include/sys/list.h new file mode 100644 index 0000000000..8339b6226d --- /dev/null +++ b/lib/libspl/include/sys/list.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of 
the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LIST_H +#define _SYS_LIST_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct list_node list_node_t; +typedef struct list list_t; + +void list_create(list_t *, size_t, size_t); +void list_destroy(list_t *); + +void list_insert_after(list_t *, void *, void *); +void list_insert_before(list_t *, void *, void *); +void list_insert_head(list_t *, void *); +void list_insert_tail(list_t *, void *); +void list_remove(list_t *, void *); +void *list_remove_head(list_t *); +void *list_remove_tail(list_t *); +void list_move_tail(list_t *, list_t *); + +void *list_head(list_t *); +void *list_tail(list_t *); +void *list_next(list_t *, void *); +void *list_prev(list_t *, void *); +int list_is_empty(list_t *); + +void list_link_init(list_node_t *); +void list_link_replace(list_node_t *, list_node_t *); + +int list_link_active(list_node_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_H */ diff --git a/lib/libspl/include/sys/list_impl.h b/lib/libspl/include/sys/list_impl.h new file mode 100644 index 0000000000..9c42f88320 --- /dev/null +++ 
b/lib/libspl/include/sys/list_impl.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LIST_IMPL_H +#define _SYS_LIST_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct list_node { + struct list_node *list_next; + struct list_node *list_prev; +}; + +struct list { + size_t list_size; + size_t list_offset; + struct list_node list_head; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_IMPL_H */ diff --git a/lib/libspl/list.c b/lib/libspl/list.c new file mode 100644 index 0000000000..e8db13a5cf --- /dev/null +++ b/lib/libspl/list.c @@ -0,0 +1,245 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Generic doubly-linked list implementation + */ + +#include +#include +#include +#include +#include + +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) +#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) + +#define list_insert_after_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_prev = (node); \ + lnew->list_next = (node)->list_next; \ + (node)->list_next->list_prev = lnew; \ + (node)->list_next = lnew; \ +} + +#define list_insert_before_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_next = (node); \ + lnew->list_prev = (node)->list_prev; \ + (node)->list_prev->list_next = lnew; \ + (node)->list_prev = lnew; \ +} + +#define list_remove_node(node) \ + (node)->list_prev->list_next = (node)->list_next; \ + (node)->list_next->list_prev = (node)->list_prev; \ + (node)->list_next = (node)->list_prev = NULL + +void +list_create(list_t *list, size_t size, size_t offset) +{ + ASSERT(list); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (list_node_t)); + + list->list_size = size; + list->list_offset = offset; + list->list_head.list_next = list->list_head.list_prev = + &list->list_head; +} + +void +list_destroy(list_t *list) +{ + 
list_node_t *node = &list->list_head; + + ASSERT(list); + ASSERT(list->list_head.list_next == node); + ASSERT(list->list_head.list_prev == node); + + node->list_next = node->list_prev = NULL; +} + +void +list_insert_after(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_head(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_after_node(list, lold, nobject); + } +} + +void +list_insert_before(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_tail(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_before_node(list, lold, nobject); + } +} + +void +list_insert_head(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_after_node(list, lold, object); +} + +void +list_insert_tail(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_before_node(list, lold, object); +} + +void +list_remove(list_t *list, void *object) +{ + list_node_t *lold = list_d2l(list, object); + ASSERT(!list_empty(list)); + ASSERT(lold->list_next != NULL); + list_remove_node(lold); +} + +void * +list_remove_head(list_t *list) +{ + list_node_t *head = list->list_head.list_next; + if (head == &list->list_head) + return (NULL); + list_remove_node(head); + return (list_object(list, head)); +} + +void * +list_remove_tail(list_t *list) +{ + list_node_t *tail = list->list_head.list_prev; + if (tail == &list->list_head) + return (NULL); + list_remove_node(tail); + return (list_object(list, tail)); +} + +void * +list_head(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_next)); +} + +void * +list_tail(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_prev)); +} + +void * +list_next(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_next != 
&list->list_head) + return (list_object(list, node->list_next)); + + return (NULL); +} + +void * +list_prev(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_prev != &list->list_head) + return (list_object(list, node->list_prev)); + + return (NULL); +} + +/* + * Insert src list after dst list. Empty src list thereafter. + */ +void +list_move_tail(list_t *dst, list_t *src) +{ + list_node_t *dstnode = &dst->list_head; + list_node_t *srcnode = &src->list_head; + + ASSERT(dst->list_size == src->list_size); + ASSERT(dst->list_offset == src->list_offset); + + if (list_empty(src)) + return; + + dstnode->list_prev->list_next = srcnode->list_next; + srcnode->list_next->list_prev = dstnode->list_prev; + dstnode->list_prev = srcnode->list_prev; + srcnode->list_prev->list_next = dstnode; + + /* empty src list */ + srcnode->list_next = srcnode->list_prev = srcnode; +} + +void +list_link_replace(list_node_t *lold, list_node_t *lnew) +{ + ASSERT(list_link_active(lold)); + ASSERT(!list_link_active(lnew)); + + lnew->list_next = lold->list_next; + lnew->list_prev = lold->list_prev; + lold->list_prev->list_next = lnew; + lold->list_next->list_prev = lnew; + lold->list_next = lold->list_prev = NULL; +} + +void +list_link_init(list_node_t *link) +{ + link->list_next = NULL; + link->list_prev = NULL; +} + +int +list_link_active(list_node_t *link) +{ + return (link->list_next != NULL); +} + +int +list_is_empty(list_t *list) +{ + return (list_empty(list)); +} diff --git a/lib/libspl/mkdirp.c b/lib/libspl/mkdirp.c new file mode 100644 index 0000000000..9c81f2a0b8 --- /dev/null +++ b/lib/libspl/mkdirp.c @@ -0,0 +1,212 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Creates directory and it's parents if the parents do not + * exist yet. + * + * Returns -1 if fails for reasons other than non-existing + * parents. + * Does NOT simplify pathnames with . or .. in them. + */ + +#include +#include +#include +#include +#include +#include +#include + +static char *simplify(const char *str); + +int +mkdirp(const char *d, mode_t mode) +{ + char *endptr, *ptr, *slash, *str; + + str = simplify(d); + + /* If space couldn't be allocated for the simplified names, return. */ + + if (str == NULL) + return (-1); + + /* Try to make the directory */ + + if (mkdir(str, mode) == 0) { + free(str); + return (0); + } + if (errno != ENOENT) { + free(str); + return (-1); + } + endptr = strrchr(str, '\0'); + slash = strrchr(str, '/'); + + /* Search upward for the non-existing parent */ + + while (slash != NULL) { + + ptr = slash; + *ptr = '\0'; + + /* If reached an existing parent, break */ + + if (access(str, F_OK) == 0) + break; + + /* If non-existing parent */ + + else { + slash = strrchr(str, '/'); + + /* If under / or current directory, make it. 
*/ + + if (slash == NULL || slash == str) { + if (mkdir(str, mode) != 0 && errno != EEXIST) { + free(str); + return (-1); + } + break; + } + } + } + + /* Create directories starting from upmost non-existing parent */ + + while ((ptr = strchr(str, '\0')) != endptr) { + *ptr = '/'; + if (mkdir(str, mode) != 0 && errno != EEXIST) { + /* + * If the mkdir fails because str already + * exists (EEXIST), then str has the form + * "existing-dir/..", and this is really + * ok. (Remember, this loop is creating the + * portion of the path that didn't exist) + */ + free(str); + return (-1); + } + } + free(str); + return (0); +} + +/* + * simplify - given a pathname, simplify that path by removing + * duplicate contiguous slashes. + * + * A simplified copy of the argument is returned to the + * caller, or NULL is returned on error. + * + * The caller should handle error reporting based upon the + * returned vlaue, and should free the returned value, + * when appropriate. + */ + +static char * +simplify(const char *str) +{ + int i; + size_t mbPathlen; /* length of multi-byte path */ + size_t wcPathlen; /* length of wide-character path */ + wchar_t *wptr; /* scratch pointer */ + wchar_t *wcPath; /* wide-character version of the path */ + char *mbPath; /* The copy fo the path to be returned */ + + /* + * bail out if there is nothing there. + */ + + if (!str) + return (NULL); + + /* + * Get a copy of the argument. + */ + + if ((mbPath = strdup(str)) == NULL) { + return (NULL); + } + + /* + * convert the multi-byte version of the path to a + * wide-character rendering, for doing our figuring. 
+ */ + + mbPathlen = strlen(mbPath); + + if ((wcPath = calloc(sizeof (wchar_t), mbPathlen+1)) == NULL) { + free(mbPath); + return (NULL); + } + + if ((wcPathlen = mbstowcs(wcPath, mbPath, mbPathlen)) == (size_t)-1) { + free(mbPath); + free(wcPath); + return (NULL); + } + + /* + * remove duplicate slashes first ("//../" -> "/") + */ + + for (wptr = wcPath, i = 0; i < wcPathlen; i++) { + *wptr++ = wcPath[i]; + + if (wcPath[i] == '/') { + i++; + + while (wcPath[i] == '/') { + i++; + } + + i--; + } + } + + *wptr = '\0'; + + /* + * now convert back to the multi-byte format. + */ + + if (wcstombs(mbPath, wcPath, mbPathlen) == (size_t)-1) { + free(mbPath); + free(wcPath); + return (NULL); + } + + free(wcPath); + return (mbPath); +} diff --git a/lib/libspl/strlcat.c b/lib/libspl/strlcat.c new file mode 100644 index 0000000000..07d1403dde --- /dev/null +++ b/lib/libspl/strlcat.c @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "lint.h" +#include +#include + +/* + * Appends src to the dstsize buffer at dst. 
The append will never + * overflow the destination buffer and the buffer will always be null + * terminated. Never reference beyond &dst[dstsize-1] when computing + * the length of the pre-existing string. + */ + +size_t +strlcat(char *dst, const char *src, size_t dstsize) +{ + char *df = dst; + size_t left = dstsize; + size_t l1; + size_t l2 = strlen(src); + size_t copied; + + while (left-- != 0 && *df != '\0') + df++; + l1 = df - dst; + if (dstsize == l1) + return (l1 + l2); + + copied = l1 + l2 >= dstsize ? dstsize - l1 - 1 : l2; + (void) memcpy(dst + l1, src, copied); + dst[l1+copied] = '\0'; + return (l1 + l2); +} diff --git a/lib/libspl/strlcpy.c b/lib/libspl/strlcpy.c new file mode 100644 index 0000000000..7a8009b893 --- /dev/null +++ b/lib/libspl/strlcpy.c @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "lint.h" +#include +#include + +/* + * Copies src to the dstsize buffer at dst. The copy will never + * overflow the destination buffer and the buffer will always be null + * terminated. 
+ */ + +size_t +strlcpy(char *dst, const char *src, size_t len) +{ + size_t slen = strlen(src); + size_t copied; + + if (len == 0) + return (slen); + + if (slen >= len) + copied = len - 1; + else + copied = slen; + (void) memcpy(dst, src, copied); + dst[copied] = '\0'; + return (slen); +} diff --git a/lib/libspl/strnlen.c b/lib/libspl/strnlen.c new file mode 100644 index 0000000000..605245b6bb --- /dev/null +++ b/lib/libspl/strnlen.c @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. + * All rights reserved. Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "lint.h" +#include +#include + +/* + * Returns the number of non-NULL bytes in string argument, + * but not more than maxlen. Does not look past str + maxlen. 
+ */ +size_t +strnlen(const char *str, size_t maxlen) +{ + const char *ptr; + + ptr = memchr(str, 0, maxlen); + if (ptr == NULL) + return (maxlen); + + return (ptr - str); +} diff --git a/lib/libspl/u8_textprep.c b/lib/libspl/u8_textprep.c new file mode 100644 index 0000000000..35cafbaa9a --- /dev/null +++ b/lib/libspl/u8_textprep.c @@ -0,0 +1,2133 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +/* + * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458). + * + * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), + * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also + * the section 3C man pages. + * Interface stability: Committed. + */ + +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#else +#include +#include +#endif /* _KERNEL */ +#include +#include +#include + + +/* The maximum possible number of bytes in a UTF-8 character. 
*/ +#define U8_MB_CUR_MAX (4) + +/* + * The maximum number of bytes needed for a UTF-8 character to cover + * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2. + */ +#define U8_MAX_BYTES_UCS2 (3) + +/* The maximum possible number of bytes in a Stream-Safe Text. */ +#define U8_STREAM_SAFE_TEXT_MAX (128) + +/* + * The maximum number of characters in a combining/conjoining sequence and + * the actual upper bound of a combining/conjoining sequence. + */ +#define U8_MAX_CHARS_A_SEQ (32) +#define U8_UPPER_LIMIT_IN_A_SEQ (31) + +/* The combining class value for Starter. */ +#define U8_COMBINING_CLASS_STARTER (0) + +/* + * Some Hangul related macros follow. + * + * The first and the last of Hangul syllables, Hangul Jamo Leading consonants, + * Vowels, and optional Trailing consonants in Unicode scalar values. + * + * Please note that U8_HANGUL_JAMO_T_FIRST is defined below as 0x11A7, not + * the actual U+11A8. This is because the trailing consonant is optional + * and thus we pre-calculate by subtracting one. + * + * Each of the 19 modern leading consonants has a total of 588 possible + * syllables since Hangul has 21 modern vowels and 27 modern trailing + * consonants plus 1 for the no trailing consonant case, i.e., 21 x 28 = 588. + * + * We also have a bunch of Hangul related macros below. Please bear in mind + * that U8_HANGUL_JAMO_1ST_BYTE can be used to check whether a character is + * a Hangul Jamo or not, but a match does not guarantee that it is a Hangul + * Jamo; it only means that it most likely is one.
+ */ +#define U8_HANGUL_SYL_FIRST (0xAC00U) +#define U8_HANGUL_SYL_LAST (0xD7A3U) + +#define U8_HANGUL_JAMO_L_FIRST (0x1100U) +#define U8_HANGUL_JAMO_L_LAST (0x1112U) +#define U8_HANGUL_JAMO_V_FIRST (0x1161U) +#define U8_HANGUL_JAMO_V_LAST (0x1175U) +#define U8_HANGUL_JAMO_T_FIRST (0x11A7U) +#define U8_HANGUL_JAMO_T_LAST (0x11C2U) + +#define U8_HANGUL_V_COUNT (21) +#define U8_HANGUL_VT_COUNT (588) +#define U8_HANGUL_T_COUNT (28) + +#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) + +#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ + (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ + (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ + (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); + +#define U8_HANGUL_JAMO_L(u) \ + ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) + +#define U8_HANGUL_JAMO_V(u) \ + ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) + +#define U8_HANGUL_JAMO_T(u) \ + ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) + +#define U8_HANGUL_JAMO(u) \ + ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) + +#define U8_HANGUL_SYLLABLE(u) \ + ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) + +#define U8_HANGUL_COMPOSABLE_L_V(s, u) \ + ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) + +#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ + ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) + +/* The types of decomposition mappings. */ +#define U8_DECOMP_BOTH (0xF5U) +#define U8_DECOMP_CANONICAL (0xF6U) + +/* The indicator for 16-bit table. */ +#define U8_16BIT_TABLE_INDICATOR (0x8000U) + +/* Convenience macros; multi-statement ones need a braced context. */ +#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ + (u) = (((uint32_t)(b1) & 0x0F) << 12) | \ + (((uint32_t)(b2) & 0x3F) << 6) | \ + ((uint32_t)(b3) & 0x3F); + +#define U8_SIMPLE_SWAP(a, b, t) \ + (t) = (a); \ + (a) = (b); \ + (b) = (t); + +#define U8_ASCII_TOUPPER(c) \ + (((c) >= 'a' && (c) <= 'z') ?
(c) - 'a' + 'A' : (c)) + +#define U8_ASCII_TOLOWER(c) \ + (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) + +#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) +/* + * The following macro assumes that the two characters that are to be + * swapped are adjacent to each other and 'a' comes before 'b'. + * It also uses the caller's k, u8s, u8t, start, disp, comb_class and tc. + * If the assumptions are not met, the macro will fail. + */ +#define U8_SWAP_COMB_MARKS(a, b) \ + for (k = 0; k < disp[(a)]; k++) \ + u8t[k] = u8s[start[(a)] + k]; \ + for (k = 0; k < disp[(b)]; k++) \ + u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ + start[(b)] = start[(a)] + disp[(b)]; \ + for (k = 0; k < disp[(a)]; k++) \ + u8s[start[(b)] + k] = u8t[k]; \ + U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ + U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); + +/* The possible states during normalization. */ +typedef enum { + U8_STATE_START = 0, + U8_STATE_HANGUL_L = 1, + U8_STATE_HANGUL_LV = 2, + U8_STATE_HANGUL_LVT = 3, + U8_STATE_HANGUL_V = 4, + U8_STATE_HANGUL_T = 5, + U8_STATE_COMBINING_MARK = 6 +} u8_normalization_states_t; + +/* + * The three vectors below are used to check that the bytes of a given UTF-8 + * character are valid and do not contain any malformed byte values. + * + * We used to have a quite relaxed UTF-8 binary representation but then there + * were some security related issues and so the Unicode Consortium defined + * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it + * one more time at the Unicode 3.2. The following three tables are based on + * that.
+ */ + +#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) + +#define I_ U8_ILLEGAL_CHAR +#define O_ U8_OUT_OF_RANGE_CHAR + +const int8_t u8_number_of_bytes[0x100] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ + I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + +/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ + 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, +}; + +#undef I_ +#undef O_ + +const uint8_t u8_valid_min_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 
0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* C8 C9 CA CB CC CD CE CF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* D8 D9 DA DB DC DD DE DF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* E8 E9 EA EB EC ED EE EF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +const uint8_t u8_valid_max_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* C8 C9 CA CB CC CD CE CF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* D8 D9 DA DB DC DD DE DF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* E8 E9 EA EB EC ED EE EF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0xbf, 
0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + + +/* + * The u8_validate() validates on the given UTF-8 character string and + * calculate the byte length. It is quite similar to mblen(3C) except that + * this will validate against the list of characters if required and + * specific to UTF-8 and Unicode. + */ +int +u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum) +{ + uchar_t *ib; + uchar_t *ibtail; + uchar_t **p; + uchar_t *s1; + uchar_t *s2; + uchar_t f; + int sz; + size_t i; + int ret_val; + boolean_t second; + boolean_t no_need_to_validate_entire; + boolean_t check_additional; + boolean_t validate_ucs2_range_only; + + if (! u8str) + return (0); + + ib = (uchar_t *)u8str; + ibtail = ib + n; + + ret_val = 0; + + no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); + check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; + validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; + + while (ib < ibtail) { + /* + * The first byte of a UTF-8 character tells how many + * bytes will follow for the character. If the first byte + * is an illegal byte value or out of range value, we just + * return -1 with an appropriate error number. + */ + sz = u8_number_of_bytes[*ib]; + if (sz == U8_ILLEGAL_CHAR) { + *errnum = EILSEQ; + return (-1); + } + + if (sz == U8_OUT_OF_RANGE_CHAR || + (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { + *errnum = ERANGE; + return (-1); + } + + /* + * If we don't have enough bytes to check on, that's also + * an error. As you can see, we give illegal byte sequence + * checking higher priority then EINVAL cases. + */ + if ((ibtail - ib) < sz) { + *errnum = EINVAL; + return (-1); + } + + if (sz == 1) { + ib++; + ret_val++; + } else { + /* + * Check on the multi-byte UTF-8 character. For more + * details on this, see comment added for the used + * data structures at the beginning of the file. 
+ */ + f = *ib++; + ret_val++; + second = B_TRUE; + for (i = 1; i < sz; i++) { + if (second) { + if (*ib < u8_valid_min_2nd_byte[f] || + *ib > u8_valid_max_2nd_byte[f]) { + *errnum = EILSEQ; + return (-1); + } + second = B_FALSE; + } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { + *errnum = EILSEQ; + return (-1); + } + ib++; + ret_val++; + } + } + + if (check_additional) { + for (p = (uchar_t **)list, i = 0; p[i]; i++) { + s1 = ib - sz; + s2 = p[i]; + while (s1 < ib) { + if (*s1 != *s2 || *s2 == '\0') + break; + s1++; + s2++; + } + + if (s1 >= ib && *s2 == '\0') { + *errnum = EBADF; + return (-1); + } + } + } + + if (no_need_to_validate_entire) + break; + } + + return (ret_val); +} + +/* + * The do_case_conv() looks at the mapping tables and returns found + * bytes if any. If not found, the input bytes are returned. The function + * always terminate the return bytes with a null character assuming that + * there are plenty of room to do so. + * + * The case conversions are simple case conversions mapping a character to + * another character as specified in the Unicode data. The byte size of + * the mapped character could be different from that of the input character. + * + * The return value is the byte length of the returned character excluding + * the terminating null byte. + */ +static size_t +do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) +{ + size_t i; + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + + /* + * At this point, the only possible values for sz are 2, 3, and 4. + * The u8s should point to a vector that is well beyond the size of + * 5 bytes. 
+ */ + if (sz == 2) { + b3 = u8s[0] = s[0]; + b4 = u8s[1] = s[1]; + } else if (sz == 3) { + b2 = u8s[0] = s[0]; + b3 = u8s[1] = s[1]; + b4 = u8s[2] = s[2]; + } else if (sz == 4) { + b1 = u8s[0] = s[0]; + b2 = u8s[1] = s[1]; + b3 = u8s[2] = s[2]; + b4 = u8s[3] = s[3]; + } else { + /* This is not possible but just in case as a fallback. */ + if (is_it_toupper) + *u8s = U8_ASCII_TOUPPER(*s); + else + *u8s = U8_ASCII_TOLOWER(*s); + u8s[1] = '\0'; + + return (1); + } + u8s[sz] = '\0'; + + /* + * Let's find out if we have a corresponding character. + */ + b1 = u8_common_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b2 = u8_case_common_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + if (is_it_toupper) { + b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; + + /* Either there is no match or an error at the table. */ + if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) + return ((size_t)sz); + + b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; + + for (i = 0; start_id < end_id; start_id++) + u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; + } else { + b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; + + if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) + return ((size_t)sz); + + b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; + + for (i = 0; start_id < end_id; start_id++) + u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; + } + + /* + * If i is still zero, that means there is no corresponding character. 
+ */ + if (i == 0) + return ((size_t)sz); + + u8s[i] = '\0'; + + return (i); +} + +/* + * The do_case_compare() function compares the two input strings, s1 and s2, + * one character at a time doing case conversions if applicable and return + * the comparison result as like strcmp(). + * + * Since, in empirical sense, most of text data are 7-bit ASCII characters, + * we treat the 7-bit ASCII characters as a special case trying to yield + * faster processing time. + */ +static int +do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, + size_t n2, boolean_t is_it_toupper, int *errnum) +{ + int f; + int sz1; + int sz2; + size_t j; + size_t i1; + size_t i2; + uchar_t u8s1[U8_MB_CUR_MAX + 1]; + uchar_t u8s2[U8_MB_CUR_MAX + 1]; + + i1 = i2 = 0; + while (i1 < n1 && i2 < n2) { + /* + * Find out what would be the byte length for this UTF-8 + * character at string s1 and also find out if this is + * an illegal start byte or not and if so, issue a proper + * error number and yet treat this byte as a character. + */ + sz1 = u8_number_of_bytes[*s1]; + if (sz1 < 0) { + *errnum = EILSEQ; + sz1 = 1; + } + + /* + * For 7-bit ASCII characters mainly, we do a quick case + * conversion right at here. + * + * If we don't have enough bytes for this character, issue + * an EINVAL error and use what are available. + * + * If we have enough bytes, find out if there is + * a corresponding uppercase character and if so, copy over + * the bytes for a comparison later. If there is no + * corresponding uppercase character, then, use what we have + * for the comparison. + */ + if (sz1 == 1) { + if (is_it_toupper) + u8s1[0] = U8_ASCII_TOUPPER(*s1); + else + u8s1[0] = U8_ASCII_TOLOWER(*s1); + s1++; + u8s1[1] = '\0'; + } else if ((i1 + sz1) > n1) { + *errnum = EINVAL; + for (j = 0; (i1 + j) < n1; ) + u8s1[j++] = *s1++; + u8s1[j] = '\0'; + } else { + (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); + s1 += sz1; + } + + /* Do the same for the string s2. 
*/ + sz2 = u8_number_of_bytes[*s2]; + if (sz2 < 0) { + *errnum = EILSEQ; + sz2 = 1; + } + + if (sz2 == 1) { + if (is_it_toupper) + u8s2[0] = U8_ASCII_TOUPPER(*s2); + else + u8s2[0] = U8_ASCII_TOLOWER(*s2); + s2++; + u8s2[1] = '\0'; + } else if ((i2 + sz2) > n2) { + *errnum = EINVAL; + for (j = 0; (i2 + j) < n2; ) + u8s2[j++] = *s2++; + u8s2[j] = '\0'; + } else { + (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); + s2 += sz2; + } + + /* Now compare the two characters. */ + if (sz1 == 1 && sz2 == 1) { + if (*u8s1 > *u8s2) + return (1); + if (*u8s1 < *u8s2) + return (-1); + } else { + f = strcmp((const char *)u8s1, (const char *)u8s2); + if (f != 0) + return (f); + } + + /* + * They were the same. Let's move on to the next + * characters then. + */ + i1 += sz1; + i2 += sz2; + } + + /* + * We compared until the end of either or both strings. + * + * If we reached to or went over the ends for the both, that means + * they are the same. + * + * If we reached only one of the two ends, that means the other string + * has something which then the fact can be used to determine + * the return value. + */ + if (i1 >= n1) { + if (i2 >= n2) + return (0); + return (-1); + } + return (1); +} + +/* + * The combining_class() function checks on the given bytes and find out + * the corresponding Unicode combining class value. The return value 0 means + * it is a Starter. Any illegal UTF-8 character will also be treated as + * a Starter. 
+ */ +static uchar_t +combining_class(size_t uv, uchar_t *s, size_t sz) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b4 = 0; + + if (sz == 1 || sz > 4) /* single-byte and oversized inputs are reported as class 0 */ + return (0); + + if (sz == 2) { /* right-align the bytes: the last byte always lands in b4 */ + b3 = s[0]; + b4 = s[1]; + } else if (sz == 3) { + b2 = s[0]; + b3 = s[1]; + b4 = s[2]; + } else if (sz == 4) { + b1 = s[0]; + b2 = s[1]; + b3 = s[2]; + b4 = s[3]; + } + + b1 = u8_common_b1_tbl[uv][b1]; /* each lookup result indexes the next table */ + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return (0); + + b2 = u8_combining_class_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return (0); + + b3 = u8_combining_class_b3_tbl[uv][b2][b3]; + if (b3 == U8_TBL_ELEMENT_NOT_DEF) + return (0); + + return (u8_combining_class_b4_tbl[uv][b3][b4]); +} + +/* + * The do_decomp() function looks up a matching decomposition, if any, and + * stores it in u8s. If there is no match, the input bytes are copied instead. + * The function also checks for a Hangul syllable and decomposes it into its + * Jamos if necessary. + * + * To save time, a single byte 7-bit ASCII character should be handled by + * the caller. + * + * The function returns the number of bytes stored in u8s, not counting the + * null byte that always terminates them. It also reports, through state, + * whether a Hangul character was seen, which is then used by the caller. + */ +static size_t +do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, + boolean_t canonical_decomposition, u8_normalization_states_t *state) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + size_t i; + uint32_t u1; + + if (sz == 2) { + b3 = u8s[0] = s[0]; + b4 = u8s[1] = s[1]; + u8s[2] = '\0'; + } else if (sz == 3) { + /* Convert it to a Unicode scalar value. */ + U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); + + /* + * If this is a Hangul syllable, we decompose it into + * a leading consonant, a vowel, and an optional trailing + * consonant and then return.
+ */ + if (U8_HANGUL_SYLLABLE(u1)) { + u1 -= U8_HANGUL_SYL_FIRST; + + b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; + b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) + / U8_HANGUL_T_COUNT; + b3 = u1 % U8_HANGUL_T_COUNT; /* 0: no trailing consonant */ + + U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); + U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); + if (b3) { + b3 += U8_HANGUL_JAMO_T_FIRST; + U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); + + u8s[9] = '\0'; + *state = U8_STATE_HANGUL_LVT; + return (9); /* L, V and T at three bytes each */ + } + + u8s[6] = '\0'; + *state = U8_STATE_HANGUL_LV; + return (6); /* L and V only */ + } + + b2 = u8s[0] = s[0]; + b3 = u8s[1] = s[1]; + b4 = u8s[2] = s[2]; + u8s[3] = '\0'; + + /* + * If this is a Hangul Jamo, we know there is nothing + * further that we can decompose. + */ + if (U8_HANGUL_JAMO_L(u1)) { + *state = U8_STATE_HANGUL_L; + return (3); + } + + if (U8_HANGUL_JAMO_V(u1)) { + if (*state == U8_STATE_HANGUL_L) + *state = U8_STATE_HANGUL_LV; + else + *state = U8_STATE_HANGUL_V; + return (3); + } + + if (U8_HANGUL_JAMO_T(u1)) { + if (*state == U8_STATE_HANGUL_LV) + *state = U8_STATE_HANGUL_LVT; + else + *state = U8_STATE_HANGUL_T; + return (3); + } + } else if (sz == 4) { + b1 = u8s[0] = s[0]; + b2 = u8s[1] = s[1]; + b3 = u8s[2] = s[2]; + b4 = u8s[3] = s[3]; + u8s[4] = '\0'; + } else { + /* + * This is a fallback and should not happen if the function + * was called properly. + */ + u8s[0] = s[0]; + u8s[1] = '\0'; + *state = U8_STATE_START; + return (1); + } + + /* + * At this point, this routine does not know what it would get. + * The caller should sort it out if the state isn't a Hangul one. + */ + *state = U8_STATE_START; + + /* Try to find matching decomposition mapping byte sequence.
*/ + b1 = u8_common_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b2 = u8_decomp_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + /* + * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR + * which is 0x8000, this means we couldn't fit the mappings into + * the cardinality of an unsigned byte. + */ + if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { + b3_tbl -= U8_16BIT_TABLE_INDICATOR; + start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; + end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; + } else { + start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; + } + + /* This also means there wasn't any matching decomposition. */ + if (start_id >= end_id) + return ((size_t)sz); + + /* + * The final table for decomposition mappings has three types of + * byte sequences depending on whether a mapping is for compatibility + * decomposition, canonical decomposition, or both like the following: + * + * (1) Compatibility decomposition mappings: + * + * +---+---+-...-+---+ + * | B0| B1| ... | Bm| + * +---+---+-...-+---+ + * + * The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH). + * + * (2) Canonical decomposition mappings: + * + * +---+---+---+-...-+---+ + * | T | b0| b1| ... | bn| + * +---+---+---+-...-+---+ + * + * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). + * + * (3) Both mappings: + * + * +---+---+---+---+-...-+---+---+---+-...-+---+ + * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| + * +---+---+---+---+-...-+---+---+---+-...-+---+ + * + * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement + * byte, b0 to bn are canonical mapping bytes and B0 to Bm are + * compatibility mapping bytes.
+ * + * Note that compatibility decomposition means doing recursive + * decompositions using both compatibility decomposition mappings and + * canonical decomposition mappings. On the other hand, canonical + * decomposition means doing recursive decompositions using only + * canonical decomposition mappings. Since the table we have has gone + * through the recursions already, we do not need to do so during + * runtime, i.e., the table has been completely flattened out + * already. + */ + + b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; + + /* Get the type, T, of the byte sequence. */ + b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; + + /* + * If necessary, adjust start_id, end_id, or both. Note that if + * this is compatibility decomposition mapping, there is no + * adjustment. + */ + if (canonical_decomposition) { + /* Is the mapping only for compatibility decomposition? */ + if (b1 < U8_DECOMP_BOTH) + return ((size_t)sz); + + start_id++; /* step over the type byte T */ + + if (b1 == U8_DECOMP_BOTH) { + end_id = start_id + + u8_decomp_final_tbl[uv][b3_base + start_id]; /* D marks where the canonical bytes end */ + start_id++; /* step over the displacement byte D */ + } + } else { + /* + * Unless this is a compatibility decomposition mapping, + * we adjust the start_id. + */ + if (b1 == U8_DECOMP_BOTH) { + start_id++; + start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; /* jump past the canonical bytes to B0 */ + } else if (b1 == U8_DECOMP_CANONICAL) { + start_id++; + } + } + + for (i = 0; start_id < end_id; start_id++) + u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; + u8s[i] = '\0'; + + return (i); +} + +/* + * The find_composition_start() function uses the character bytes given to + * find the matching composition mappings, if any, and returns the address + * of the composition mappings as explained in the do_composition().
+ */ +static uchar_t * +find_composition_start(size_t uv, uchar_t *s, size_t sz) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + + if (sz == 1) { /* right-align the bytes: the last byte always lands in b4, unused leading slots stay 0 */ + b4 = s[0]; + } else if (sz == 2) { + b3 = s[0]; + b4 = s[1]; + } else if (sz == 3) { + b2 = s[0]; + b3 = s[1]; + b4 = s[2]; + } else if (sz == 4) { + b1 = s[0]; + b2 = s[1]; + b3 = s[2]; + b4 = s[3]; + } else { + /* + * This is a fallback and should not happen if the function + * was called properly. + */ + return (NULL); + } + + b1 = u8_composition_b1_tbl[uv][b1]; /* each lookup result indexes the next table */ + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + b2 = u8_composition_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { /* indicator set: the offsets come from the 16-bit b4 table */ + b3_tbl -= U8_16BIT_TABLE_INDICATOR; + start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; + end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; + } else { + start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; + } + + if (start_id >= end_id) /* empty [start_id, end_id) range: nothing is mapped for these bytes */ + return (NULL); + + b3_base = u8_composition_b3_tbl[uv][b2][b3].base; + + return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); /* address of the first byte of the entry */ +} + +/* + * The blocked() function checks on the combining class values of previous + * characters in this sequence and returns whether it is blocked or not.
+ */ +static boolean_t +blocked(uchar_t *comb_class, size_t last) +{ + uchar_t my_comb_class; + size_t i; + + my_comb_class = comb_class[last]; + for (i = 1; i < last; i++) + if (comb_class[i] >= my_comb_class || + comb_class[i] == U8_COMBINING_CLASS_STARTER) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * The do_composition() reads the character string pointed by 's' and + * do necessary canonical composition and then copy over the result back to + * the 's'. + * + * The input argument 's' cannot contain more than 32 characters. + */ +static size_t +do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, + uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) +{ + uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t tc[U8_MB_CUR_MAX]; + uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; + size_t saved_marks_count; + uchar_t *p; + uchar_t *saved_p; + uchar_t *q; + size_t i; + size_t saved_i; + size_t j; + size_t k; + size_t l; + size_t C; + size_t saved_l; + size_t size; + uint32_t u1; + uint32_t u2; + boolean_t match_not_found = B_TRUE; + + /* + * This should never happen unless the callers are doing some strange + * and unexpected things. + * + * The "last" is the index pointing to the last character not last + 1. + */ + if (last >= U8_MAX_CHARS_A_SEQ) + last = U8_UPPER_LIMIT_IN_A_SEQ; + + for (i = l = 0; i <= last; i++) { + /* + * The last or any non-Starters at the beginning, we don't + * have any chance to do composition and so we just copy them + * to the temporary buffer. + */ + if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { +SAVE_THE_CHAR: + p = s + start[i]; + size = disp[i]; + for (k = 0; k < size; k++) + t[l++] = *p++; + continue; + } + + /* + * If this could be a start of Hangul Jamos, then, we try to + * conjoin them. 
+ */ + if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { + U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], + s[start[i] + 1], s[start[i] + 2]); + U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], + s[start[i] + 4], s[start[i] + 5]); + + if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { + u1 -= U8_HANGUL_JAMO_L_FIRST; + u2 -= U8_HANGUL_JAMO_V_FIRST; + u1 = U8_HANGUL_SYL_FIRST + + (u1 * U8_HANGUL_V_COUNT + u2) * + U8_HANGUL_T_COUNT; + + i += 2; + if (i <= last) { + U8_PUT_3BYTES_INTO_UTF32(u2, + s[start[i]], s[start[i] + 1], + s[start[i] + 2]); + + if (U8_HANGUL_JAMO_T(u2)) { + u1 += u2 - + U8_HANGUL_JAMO_T_FIRST; + i++; + } + } + + U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); + i--; + l += 3; + continue; + } + } + + /* + * Let's then find out if this Starter has composition + * mapping. + */ + p = find_composition_start(uv, s + start[i], disp[i]); + if (p == NULL) + goto SAVE_THE_CHAR; + + /* + * We have a Starter with composition mapping and the next + * character is a non-Starter. Let's try to find out if + * we can do composition. + */ + + saved_p = p; + saved_i = i; + saved_l = l; + saved_marks_count = 0; + +TRY_THE_NEXT_MARK: + q = s + start[++i]; + size = disp[i]; + + /* + * The next for() loop compares the non-Starter pointed by + * 'q' with the possible (joinable) characters pointed by 'p'. + * + * The composition final table entry pointed by the 'p' + * looks like the following: + * + * +---+---+---+-...-+---+---+---+---+-...-+---+---+ + * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | + * +---+---+---+-...-+---+---+---+---+-...-+---+---+ + * + * where C is the count byte indicating the number of + * mapping pairs where each pair would be look like + * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second + * character of a canonical decomposition and the B0-Bm are + * the bytes of a matching composite character. The F is + * a filler byte after each character as the separator. 
+ */ + + match_not_found = B_TRUE; + + for (C = *p++; C > 0; C--) { + for (k = 0; k < size; p++, k++) + if (*p != q[k]) + break; + + /* Have we found it? */ + if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { + match_not_found = B_FALSE; + + l = saved_l; + + while (*++p != U8_TBL_ELEMENT_FILLER) + t[l++] = *p; + + break; + } + + /* We didn't find; skip to the next pair. */ + if (*p != U8_TBL_ELEMENT_FILLER) + while (*++p != U8_TBL_ELEMENT_FILLER) + ; + while (*++p != U8_TBL_ELEMENT_FILLER) + ; + p++; + } + + /* + * If there was no match, we will need to save the combining + * mark for later appending. After that, if the next one + * is a non-Starter and not blocked, then, we try once + * again to do composition with the next non-Starter. + * + * If there was no match and this was a Starter, then, + * this is a new start. + * + * If there was a match and a composition done and we have + * more to check on, then, we retrieve a new composition final + * table entry for the composite and then try to do the + * composition again. + */ + + if (match_not_found) { + if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { + i--; + goto SAVE_THE_CHAR; + } + + saved_marks[saved_marks_count++] = i; + } + + if (saved_l == l) { + while (i < last) { + if (blocked(comb_class, i + 1)) + saved_marks[saved_marks_count++] = ++i; + else + break; + } + if (i < last) { + p = saved_p; + goto TRY_THE_NEXT_MARK; + } + } else if (i < last) { + p = find_composition_start(uv, t + saved_l, + l - saved_l); + if (p != NULL) { + saved_p = p; + goto TRY_THE_NEXT_MARK; + } + } + + /* + * There is no more composition possible. + * + * If there was no composition what so ever then we copy + * over the original Starter and then append any non-Starters + * remaining at the target string sequentially after that. 
+ */ + + if (saved_l == l) { + p = s + start[saved_i]; + size = disp[saved_i]; + for (j = 0; j < size; j++) + t[l++] = *p++; + } + + for (k = 0; k < saved_marks_count; k++) { + p = s + start[saved_marks[k]]; + size = disp[saved_marks[k]]; + for (j = 0; j < size; j++) + t[l++] = *p++; + } + } + + /* + * If the last character is a Starter and if we have a character + * (possibly another Starter) that can be turned into a composite, + * we do so and we do so until there is no more of composition + * possible. + */ + if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { + p = *os; + saved_l = l - disp[last]; + + while (p < oslast) { + size = u8_number_of_bytes[*p]; + if (size <= 1 || (p + size) > oslast) + break; + + saved_p = p; + + for (i = 0; i < size; i++) + tc[i] = *p++; + + q = find_composition_start(uv, t + saved_l, + l - saved_l); + if (q == NULL) { + p = saved_p; + break; + } + + match_not_found = B_TRUE; + + for (C = *q++; C > 0; C--) { + for (k = 0; k < size; q++, k++) + if (*q != tc[k]) + break; + + if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { + match_not_found = B_FALSE; + + l = saved_l; + + while (*++q != U8_TBL_ELEMENT_FILLER) { + /* + * This is practically + * impossible but we don't + * want to take any chances. + */ + if (l >= + U8_STREAM_SAFE_TEXT_MAX) { + p = saved_p; + goto SAFE_RETURN; + } + t[l++] = *q; + } + + break; + } + + if (*q != U8_TBL_ELEMENT_FILLER) + while (*++q != U8_TBL_ELEMENT_FILLER) + ; + while (*++q != U8_TBL_ELEMENT_FILLER) + ; + q++; + } + + if (match_not_found) { + p = saved_p; + break; + } + } +SAFE_RETURN: + *os = p; + } + + /* + * Now we copy over the temporary string to the target string. + * Since composition always reduces the number of characters or + * the number of characters stay, we don't need to worry about + * the buffer overflow here. 
+ */ + for (i = 0; i < l; i++) + s[i] = t[i]; + s[l] = '\0'; + + return (l); +} + +/* + * The collect_a_seq() function checks on the given string s, collect + * a sequence of characters at u8s, and return the sequence. While it collects + * a sequence, it also applies case conversion, canonical or compatibility + * decomposition, canonical decomposition, or some or all of them and + * in that order. + * + * The collected sequence cannot be bigger than 32 characters since if + * it is having more than 31 characters, the sequence will be terminated + * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into + * a Stream-Safe Text. The collected sequence is always terminated with + * a null byte and the return value is the byte length of the sequence + * including 0. The return value does not include the terminating + * null byte. + */ +static size_t +collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, + boolean_t is_it_toupper, + boolean_t is_it_tolower, + boolean_t canonical_decomposition, + boolean_t compatibility_decomposition, + boolean_t canonical_composition, + int *errnum, u8_normalization_states_t *state) +{ + uchar_t *s; + int sz; + int saved_sz; + size_t i; + size_t j; + size_t k; + size_t l; + uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; + uchar_t disp[U8_MAX_CHARS_A_SEQ]; + uchar_t start[U8_MAX_CHARS_A_SEQ]; + uchar_t u8t[U8_MB_CUR_MAX]; + uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t tc; + size_t last; + size_t saved_last; + uint32_t u1; + + /* + * Save the source string pointer which we will return a changed + * pointer if we do processing. + */ + s = *source; + + /* + * The following is a fallback for just in case callers are not + * checking the string boundaries before the calling. + */ + if (s >= slast) { + u8s[0] = '\0'; + + return (0); + } + + /* + * As the first thing, let's collect a character and do case + * conversion if necessary. 
+ */ + + sz = u8_number_of_bytes[*s]; + + if (sz < 0) { + *errnum = EILSEQ; + + u8s[0] = *s++; + u8s[1] = '\0'; + + *source = s; + + return (1); + } + + if (sz == 1) { + if (is_it_toupper) + u8s[0] = U8_ASCII_TOUPPER(*s); + else if (is_it_tolower) + u8s[0] = U8_ASCII_TOLOWER(*s); + else + u8s[0] = *s; + s++; + u8s[1] = '\0'; + } else if ((s + sz) > slast) { + *errnum = EINVAL; + + for (i = 0; s < slast; ) + u8s[i++] = *s++; + u8s[i] = '\0'; + + *source = s; + + return (i); + } else { + if (is_it_toupper || is_it_tolower) { + i = do_case_conv(uv, u8s, s, sz, is_it_toupper); + s += sz; + sz = i; + } else { + for (i = 0; i < sz; ) + u8s[i++] = *s++; + u8s[i] = '\0'; + } + } + + /* + * And then canonical/compatibility decomposition followed by + * an optional canonical composition. Please be noted that + * canonical composition is done only when a decomposition is + * done. + */ + if (canonical_decomposition || compatibility_decomposition) { + if (sz == 1) { + *state = U8_STATE_START; + + saved_sz = 1; + + comb_class[0] = 0; + start[0] = 0; + disp[0] = 1; + + last = 1; + } else { + saved_sz = do_decomp(uv, u8s, u8s, sz, + canonical_decomposition, state); + + last = 0; + + for (i = 0; i < saved_sz; ) { + sz = u8_number_of_bytes[u8s[i]]; + + comb_class[last] = combining_class(uv, + u8s + i, sz); + start[last] = i; + disp[last] = sz; + + last++; + i += sz; + } + + /* + * Decomposition yields various Hangul related + * states but not on combining marks. We need to + * find out at here by checking on the last + * character. + */ + if (*state == U8_STATE_START) { + if (comb_class[last - 1]) + *state = U8_STATE_COMBINING_MARK; + } + } + + saved_last = last; + + while (s < slast) { + sz = u8_number_of_bytes[*s]; + + /* + * If this is an illegal character, an incomplete + * character, or an 7-bit ASCII Starter character, + * then we have collected a sequence; break and let + * the next call deal with the two cases. 
+ * + * Note that this is okay only if you are using this + * function with a fixed length string, not on + * a buffer with multiple calls of one chunk at a time. + */ + if (sz <= 1) { + break; + } else if ((s + sz) > slast) { + break; + } else { + /* + * If the previous character was a Hangul Jamo + * and this character is a Hangul Jamo that + * can be conjoined, we collect the Jamo. + */ + if (*s == U8_HANGUL_JAMO_1ST_BYTE) { + U8_PUT_3BYTES_INTO_UTF32(u1, + *s, *(s + 1), *(s + 2)); + + if (U8_HANGUL_COMPOSABLE_L_V(*state, + u1)) { + i = 0; + *state = U8_STATE_HANGUL_LV; + goto COLLECT_A_HANGUL; + } + + if (U8_HANGUL_COMPOSABLE_LV_T(*state, + u1)) { + i = 0; + *state = U8_STATE_HANGUL_LVT; + goto COLLECT_A_HANGUL; + } + } + + /* + * Regardless of whatever it was, if this is + * a Starter, we don't collect the character + * since that's a new start and we will deal + * with it at the next time. + */ + i = combining_class(uv, s, sz); + if (i == U8_COMBINING_CLASS_STARTER) + break; + + /* + * We know the current character is a combining + * mark. If the previous character wasn't + * a Starter (not Hangul) or a combining mark, + * then, we don't collect this combining mark. + */ + if (*state != U8_STATE_START && + *state != U8_STATE_COMBINING_MARK) + break; + + *state = U8_STATE_COMBINING_MARK; +COLLECT_A_HANGUL: + /* + * If we collected a Starter and combining + * marks up to 30, i.e., total 31 characters, + * then, we terminate this degenerately long + * combining sequence with a U+034F COMBINING + * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in + * UTF-8 and turn this into a Stream-Safe + * Text. This will be extremely rare but + * possible. + * + * The following will also guarantee that + * we are not writing more than 32 characters + * plus a NULL at u8s[]. 
+ */ + if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { +TURN_STREAM_SAFE: + *state = U8_STATE_START; + comb_class[last] = 0; + start[last] = saved_sz; + disp[last] = 2; + last++; + + u8s[saved_sz++] = 0xCD; + u8s[saved_sz++] = 0x8F; + + break; + } + + /* + * Some combining marks also do decompose into + * another combining mark or marks. + */ + if (*state == U8_STATE_COMBINING_MARK) { + k = last; + l = sz; + i = do_decomp(uv, uts, s, sz, + canonical_decomposition, state); + for (j = 0; j < i; ) { + sz = u8_number_of_bytes[uts[j]]; + + comb_class[last] = + combining_class(uv, + uts + j, sz); + start[last] = saved_sz + j; + disp[last] = sz; + + last++; + if (last >= + U8_UPPER_LIMIT_IN_A_SEQ) { + last = k; + goto TURN_STREAM_SAFE; + } + j += sz; + } + + *state = U8_STATE_COMBINING_MARK; + sz = i; + s += l; + + for (i = 0; i < sz; i++) + u8s[saved_sz++] = uts[i]; + } else { + comb_class[last] = i; + start[last] = saved_sz; + disp[last] = sz; + last++; + + for (i = 0; i < sz; i++) + u8s[saved_sz++] = *s++; + } + + /* + * If this is U+0345 COMBINING GREEK + * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., + * iota subscript, and need to be converted to + * uppercase letter, convert it to U+0399 GREEK + * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), + * i.e., convert to capital adscript form as + * specified in the Unicode standard. + * + * This is the only special case of (ambiguous) + * case conversion at combining marks and + * probably the standard will never have + * anything similar like this in future. + */ + if (is_it_toupper && sz >= 2 && + u8s[saved_sz - 2] == 0xCD && + u8s[saved_sz - 1] == 0x85) { + u8s[saved_sz - 2] = 0xCE; + u8s[saved_sz - 1] = 0x99; + } + } + } + + /* + * Let's try to ensure a canonical ordering for the collected + * combining marks. We do this only if we have collected + * at least one more non-Starter. (The decomposition mapping + * data tables have fully (and recursively) expanded and + * canonically ordered decompositions.) 
+ * + * The U8_SWAP_COMB_MARKS() convenience macro has some + * assumptions and we are meeting the assumptions. + */ + last--; + if (last >= saved_last) { + for (i = 0; i < last; i++) + for (j = last; j > i; j--) + if (comb_class[j] && + comb_class[j - 1] > comb_class[j]) { + U8_SWAP_COMB_MARKS(j - 1, j); + } + } + + *source = s; + + if (! canonical_composition) { + u8s[saved_sz] = '\0'; + return (saved_sz); + } + + /* + * Now do the canonical composition. Note that we do this + * only after a canonical or compatibility decomposition to + * finish up NFC or NFKC. + */ + sz = do_composition(uv, u8s, comb_class, start, disp, last, + &s, slast); + } + + *source = s; + + return ((size_t)sz); +} + +/* + * The do_norm_compare() function does string comparion based on Unicode + * simple case mappings and Unicode Normalization definitions. + * + * It does so by collecting a sequence of character at a time and comparing + * the collected sequences from the strings. + * + * The meanings on the return values are the same as the usual strcmp(). 
+ */ +static int +do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, + int flag, int *errnum) +{ + int result; + size_t sz1; + size_t sz2; + uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t *s1last; + uchar_t *s2last; + boolean_t is_it_toupper; + boolean_t is_it_tolower; + boolean_t canonical_decomposition; + boolean_t compatibility_decomposition; + boolean_t canonical_composition; + u8_normalization_states_t state; + + s1last = s1 + n1; + s2last = s2 + n2; + + is_it_toupper = flag & U8_TEXTPREP_TOUPPER; + is_it_tolower = flag & U8_TEXTPREP_TOLOWER; + canonical_decomposition = flag & U8_CANON_DECOMP; + compatibility_decomposition = flag & U8_COMPAT_DECOMP; + canonical_composition = flag & U8_CANON_COMP; + + while (s1 < s1last && s2 < s2last) { + /* + * If the current character is a 7-bit ASCII and the last + * character, or, if the current character and the next + * character are both some 7-bit ASCII characters then + * we treat the current character as a sequence. + * + * In any other cases, we need to call collect_a_seq(). 
+ */ + + if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || + ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { + if (is_it_toupper) + u8s1[0] = U8_ASCII_TOUPPER(*s1); + else if (is_it_tolower) + u8s1[0] = U8_ASCII_TOLOWER(*s1); + else + u8s1[0] = *s1; + u8s1[1] = '\0'; + sz1 = 1; + s1++; + } else { + state = U8_STATE_START; + sz1 = collect_a_seq(uv, u8s1, &s1, s1last, + is_it_toupper, is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, errnum, &state); + } + + if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || + ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { + if (is_it_toupper) + u8s2[0] = U8_ASCII_TOUPPER(*s2); + else if (is_it_tolower) + u8s2[0] = U8_ASCII_TOLOWER(*s2); + else + u8s2[0] = *s2; + u8s2[1] = '\0'; + sz2 = 1; + s2++; + } else { + state = U8_STATE_START; + sz2 = collect_a_seq(uv, u8s2, &s2, s2last, + is_it_toupper, is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, errnum, &state); + } + + /* + * Now compare the two characters. If they are the same, + * we move on to the next character sequences. + */ + if (sz1 == 1 && sz2 == 1) { + if (*u8s1 > *u8s2) + return (1); + if (*u8s1 < *u8s2) + return (-1); + } else { + result = strcmp((const char *)u8s1, (const char *)u8s2); + if (result != 0) + return (result); + } + } + + /* + * We compared until the end of either or both strings. + * + * If we reached to or went over the ends for the both, that means + * they are the same. + * + * If we reached only one end, that means the other string has + * something which then can be used to determine the return value. + */ + if (s1 >= s1last) { + if (s2 >= s2last) + return (0); + return (-1); + } + return (1); +} + +/* + * The u8_strcmp() function compares two UTF-8 strings quite similar to + * the strcmp(). 
For the comparison, however, Unicode Normalization specific + * equivalency and Unicode simple case conversion mappings based equivalency + * can be requested and checked against. + */ +int +u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, + int *errnum) +{ + int f; + size_t n1; + size_t n2; + + *errnum = 0; + + /* + * Check on the requested Unicode version, case conversion, and + * normalization flag values. + */ + + if (uv > U8_UNICODE_LATEST) { + *errnum = ERANGE; + uv = U8_UNICODE_LATEST; + } + + if (flag == 0) { + flag = U8_STRCMP_CS; + } else { + f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | + U8_STRCMP_CI_LOWER); + if (f == 0) { + flag |= U8_STRCMP_CS; + } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && + f != U8_STRCMP_CI_LOWER) { + *errnum = EBADF; + flag = U8_STRCMP_CS; + } + + f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); + if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && + f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { + *errnum = EBADF; + flag = U8_STRCMP_CS; + } + } + + if (flag == U8_STRCMP_CS) { + return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); + } + + n1 = strlen(s1); + n2 = strlen(s2); + if (n != 0) { + if (n < n1) + n1 = n; + if (n < n2) + n2 = n; + } + + /* + * Simple case conversion can be done much faster and so we do + * them separately here. 
+ */ + if (flag == U8_STRCMP_CI_UPPER) { + return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, + n1, n2, B_TRUE, errnum)); + } else if (flag == U8_STRCMP_CI_LOWER) { + return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, + n1, n2, B_FALSE, errnum)); + } + + return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, + flag, errnum)); +} + +size_t +u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, + int flag, size_t unicode_version, int *errnum) +{ + int f; + int sz; + uchar_t *ib; + uchar_t *ibtail; + uchar_t *ob; + uchar_t *obtail; + boolean_t do_not_ignore_null; + boolean_t do_not_ignore_invalid; + boolean_t is_it_toupper; + boolean_t is_it_tolower; + boolean_t canonical_decomposition; + boolean_t compatibility_decomposition; + boolean_t canonical_composition; + size_t ret_val; + size_t i; + size_t j; + uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; + u8_normalization_states_t state; + + if (unicode_version > U8_UNICODE_LATEST) { + *errnum = ERANGE; + return ((size_t)-1); + } + + f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); + if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { + *errnum = EBADF; + return ((size_t)-1); + } + + f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); + if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && + f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { + *errnum = EBADF; + return ((size_t)-1); + } + + if (inarray == NULL || *inlen == 0) + return (0); + + if (outarray == NULL) { + *errnum = E2BIG; + return ((size_t)-1); + } + + ib = (uchar_t *)inarray; + ob = (uchar_t *)outarray; + ibtail = ib + *inlen; + obtail = ob + *outlen; + + do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); + do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); + is_it_toupper = flag & U8_TEXTPREP_TOUPPER; + is_it_tolower = flag & U8_TEXTPREP_TOLOWER; + + ret_val = 0; + + /* + * If we don't have a normalization flag set, we do the simple case + * conversion based text 
preparation separately below. Text + * preparation involving Normalization will be done in the false task + * block, again, separately since it will take much more time and + * resource than doing simple case conversions. + */ + if (f == 0) { + while (ib < ibtail) { + if (*ib == '\0' && do_not_ignore_null) + break; + + sz = u8_number_of_bytes[*ib]; + + if (sz < 0) { + if (do_not_ignore_invalid) { + *errnum = EILSEQ; + ret_val = (size_t)-1; + break; + } + + sz = 1; + ret_val++; + } + + if (sz == 1) { + if (ob >= obtail) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + if (is_it_toupper) + *ob = U8_ASCII_TOUPPER(*ib); + else if (is_it_tolower) + *ob = U8_ASCII_TOLOWER(*ib); + else + *ob = *ib; + ib++; + ob++; + } else if ((ib + sz) > ibtail) { + if (do_not_ignore_invalid) { + *errnum = EINVAL; + ret_val = (size_t)-1; + break; + } + + if ((obtail - ob) < (ibtail - ib)) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + /* + * We treat the remaining incomplete character + * bytes as a character. 
+ */ + ret_val++; + + while (ib < ibtail) + *ob++ = *ib++; + } else { + if (is_it_toupper || is_it_tolower) { + i = do_case_conv(unicode_version, u8s, + ib, sz, is_it_toupper); + + if ((obtail - ob) < i) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + ib += sz; + + for (sz = 0; sz < i; sz++) + *ob++ = u8s[sz]; + } else { + if ((obtail - ob) < sz) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + for (i = 0; i < sz; i++) + *ob++ = *ib++; + } + } + } + } else { + canonical_decomposition = flag & U8_CANON_DECOMP; + compatibility_decomposition = flag & U8_COMPAT_DECOMP; + canonical_composition = flag & U8_CANON_COMP; + + while (ib < ibtail) { + if (*ib == '\0' && do_not_ignore_null) + break; + + /* + * If the current character is a 7-bit ASCII + * character and it is the last character, or, + * if the current character is a 7-bit ASCII + * character and the next character is also a 7-bit + * ASCII character, then, we copy over this + * character without going through collect_a_seq(). + * + * In any other cases, we need to look further with + * the collect_a_seq() function. + */ + if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || + ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { + if (ob >= obtail) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + if (is_it_toupper) + *ob = U8_ASCII_TOUPPER(*ib); + else if (is_it_tolower) + *ob = U8_ASCII_TOLOWER(*ib); + else + *ob = *ib; + ib++; + ob++; + } else { + *errnum = 0; + state = U8_STATE_START; + + j = collect_a_seq(unicode_version, u8s, + &ib, ibtail, + is_it_toupper, + is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, + errnum, &state); + + if (*errnum && do_not_ignore_invalid) { + ret_val = (size_t)-1; + break; + } + + if ((obtail - ob) < j) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + for (i = 0; i < j; i++) + *ob++ = u8s[i]; + } + } + } + + *inlen = ibtail - ib; + *outlen = obtail - ob; + + return (ret_val); +}