1331 lines
43 KiB
C
1331 lines
43 KiB
C
/* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
|
|
|
|
/*
|
|
* This file is part of The Croco Library
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of version 2.1 of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
|
* USA
|
|
*
|
|
* Author: Dodji Seketeli
|
|
* See COPYRIGHTS file for copyright information.
|
|
*/
|
|
|
|
#include "cr-utils.h"
|
|
#include "cr-string.h"
|
|
|
|
/**
 *@file:
 *Some misc utility functions used
 *in libcroco.
 *Note that throughout this file I will
 *refer to the CSS SPECIFICATIONS DOCUMENTATION
 *written by the w3c guys. You can find that document
 *at http://www.w3.org/TR/REC-CSS2/ .
 */
|
|
|
|
/****************************
|
|
*Encoding transformations and
|
|
*encoding helpers
|
|
****************************/
|
|
|
|
/*
 *Here is the correspondence between the ucs-4 character codes
 *and their matching utf-8 encoding pattern as described by RFC 2279:
 *
 *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 *------------------    -----------------------------
 *0000 0000-0000 007F   0xxxxxxx
 *0000 0080-0000 07FF   110xxxxx 10xxxxxx
 *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
 */
|
|
|
|
/**
|
|
*Given an utf8 string buffer, calculates
|
|
*the length of this string if it was encoded
|
|
*in ucs4.
|
|
*@param a_in_start a pointer to the beginning of
|
|
*the input utf8 string.
|
|
*@param a_in_end a pointre to the end of the input
|
|
*utf8 string (points to the last byte of the buffer)
|
|
*@param a_len out parameter the calculated length.
|
|
*@return CR_OK upon successful completion, an error code
|
|
*otherwise.
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
|
|
const guchar * a_in_end, gulong * a_len)
|
|
{
|
|
guchar *byte_ptr = NULL;
|
|
gint len = 0;
|
|
|
|
/*
|
|
*to store the final decoded
|
|
*unicode char
|
|
*/
|
|
guint c = 0;
|
|
|
|
g_return_val_if_fail (a_in_start && a_in_end && a_len,
|
|
CR_BAD_PARAM_ERROR);
|
|
*a_len = 0;
|
|
|
|
for (byte_ptr = (guchar *) a_in_start;
|
|
byte_ptr <= a_in_end; byte_ptr++) {
|
|
gint nb_bytes_2_decode = 0;
|
|
|
|
if (*byte_ptr <= 0x7F) {
|
|
/*
|
|
*7 bits long char
|
|
*encoded over 1 byte:
|
|
* 0xxx xxxx
|
|
*/
|
|
c = *byte_ptr;
|
|
nb_bytes_2_decode = 1;
|
|
|
|
} else if ((*byte_ptr & 0xE0) == 0xC0) {
|
|
/*
|
|
*up to 11 bits long char.
|
|
*encoded over 2 bytes:
|
|
*110x xxxx 10xx xxxx
|
|
*/
|
|
c = *byte_ptr & 0x1F;
|
|
nb_bytes_2_decode = 2;
|
|
|
|
} else if ((*byte_ptr & 0xF0) == 0xE0) {
|
|
/*
|
|
*up to 16 bit long char
|
|
*encoded over 3 bytes:
|
|
*1110 xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *byte_ptr & 0x0F;
|
|
nb_bytes_2_decode = 3;
|
|
|
|
} else if ((*byte_ptr & 0xF8) == 0xF0) {
|
|
/*
|
|
*up to 21 bits long char
|
|
*encoded over 4 bytes:
|
|
*1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *byte_ptr & 0x7;
|
|
nb_bytes_2_decode = 4;
|
|
|
|
} else if ((*byte_ptr & 0xFC) == 0xF8) {
|
|
/*
|
|
*up to 26 bits long char
|
|
*encoded over 5 bytes.
|
|
*1111 10xx 10xx xxxx 10xx xxxx
|
|
*10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *byte_ptr & 3;
|
|
nb_bytes_2_decode = 5;
|
|
|
|
} else if ((*byte_ptr & 0xFE) == 0xFC) {
|
|
/*
|
|
*up to 31 bits long char
|
|
*encoded over 6 bytes:
|
|
*1111 110x 10xx xxxx 10xx xxxx
|
|
*10xx xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *byte_ptr & 1;
|
|
nb_bytes_2_decode = 6;
|
|
|
|
} else {
|
|
/*
|
|
*BAD ENCODING
|
|
*/
|
|
return CR_ENCODING_ERROR;
|
|
}
|
|
|
|
/*
|
|
*Go and decode the remaining byte(s)
|
|
*(if any) to get the current character.
|
|
*/
|
|
for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
|
|
/*decode the next byte */
|
|
byte_ptr++;
|
|
|
|
/*byte pattern must be: 10xx xxxx */
|
|
if ((*byte_ptr & 0xC0) != 0x80) {
|
|
return CR_ENCODING_ERROR;
|
|
}
|
|
|
|
c = (c << 6) | (*byte_ptr & 0x3F);
|
|
}
|
|
|
|
len++;
|
|
}
|
|
|
|
*a_len = len;
|
|
|
|
return CR_OK;
|
|
}
|
|
|
|
/**
|
|
*Given an ucs4 string, this function
|
|
*returns the size (in bytes) this string
|
|
*would have occupied if it was encoded in utf-8.
|
|
*@param a_in_start a pointer to the beginning of the input
|
|
*buffer.
|
|
*@param a_in_end a pointer to the end of the input buffer.
|
|
*@param a_len out parameter. The computed length.
|
|
*@return CR_OK upon successful completion, an error code otherwise.
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
|
|
const guint32 * a_in_end, gulong * a_len)
|
|
{
|
|
gint len = 0;
|
|
guint32 *char_ptr = NULL;
|
|
|
|
g_return_val_if_fail (a_in_start && a_in_end && a_len,
|
|
CR_BAD_PARAM_ERROR);
|
|
|
|
for (char_ptr = (guint32 *) a_in_start;
|
|
char_ptr <= a_in_end; char_ptr++) {
|
|
if (*char_ptr <= 0x7F) {
|
|
/*the utf-8 char would take 1 byte */
|
|
len += 1;
|
|
} else if (*char_ptr <= 0x7FF) {
|
|
/*the utf-8 char would take 2 bytes */
|
|
len += 2;
|
|
} else if (*char_ptr <= 0xFFFF) {
|
|
len += 3;
|
|
} else if (*char_ptr <= 0x1FFFFF) {
|
|
len += 4;
|
|
} else if (*char_ptr <= 0x3FFFFFF) {
|
|
len += 5;
|
|
} else if (*char_ptr <= 0x7FFFFFFF) {
|
|
len += 6;
|
|
}
|
|
}
|
|
|
|
*a_len = len;
|
|
return CR_OK;
|
|
}
|
|
|
|
/**
|
|
*Given an ucsA string, this function
|
|
*returns the size (in bytes) this string
|
|
*would have occupied if it was encoded in utf-8.
|
|
*@param a_in_start a pointer to the beginning of the input
|
|
*buffer.
|
|
*@param a_in_end a pointer to the end of the input buffer.
|
|
*@param a_len out parameter. The computed length.
|
|
*@return CR_OK upon successful completion, an error code otherwise.
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
|
|
const guchar * a_in_end, gulong * a_len)
|
|
{
|
|
gint len = 0;
|
|
guchar *char_ptr = NULL;
|
|
|
|
g_return_val_if_fail (a_in_start && a_in_end && a_len,
|
|
CR_BAD_PARAM_ERROR);
|
|
|
|
for (char_ptr = (guchar *) a_in_start;
|
|
char_ptr <= a_in_end; char_ptr++) {
|
|
if (*char_ptr <= 0x7F) {
|
|
/*the utf-8 char would take 1 byte */
|
|
len += 1;
|
|
} else {
|
|
/*the utf-8 char would take 2 bytes */
|
|
len += 2;
|
|
}
|
|
}
|
|
|
|
*a_len = len;
|
|
return CR_OK;
|
|
}
|
|
|
|
/**
|
|
*Converts an utf8 buffer into an ucs4 buffer.
|
|
*
|
|
*@param a_in the input utf8 buffer to convert.
|
|
*@param a_in_len in/out parameter. The size of the
|
|
*input buffer to convert. After return, this parameter contains
|
|
*the actual number of bytes consumed.
|
|
*@param a_out the output converted ucs4 buffer. Must be allocated by
|
|
*the caller.
|
|
*@param a_out_len in/out parameter. The size of the output buffer.
|
|
*If this size is actually smaller than the real needed size, the function
|
|
*just converts what it can and returns a success status. After return,
|
|
*this param points to the actual number of characters decoded.
|
|
*@return CR_OK upon successful completion, an error code otherwise.
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_utf8_to_ucs4 (const guchar * a_in,
|
|
gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
|
|
{
|
|
gulong in_len = 0,
|
|
out_len = 0,
|
|
in_index = 0,
|
|
out_index = 0;
|
|
enum CRStatus status = CR_OK;
|
|
|
|
/*
|
|
*to store the final decoded
|
|
*unicode char
|
|
*/
|
|
guint c = 0;
|
|
|
|
g_return_val_if_fail (a_in && a_in_len
|
|
&& a_out && a_out_len, CR_BAD_PARAM_ERROR);
|
|
|
|
if (*a_in_len < 1) {
|
|
status = CR_OK;
|
|
goto end;
|
|
}
|
|
|
|
in_len = *a_in_len;
|
|
out_len = *a_out_len;
|
|
|
|
for (in_index = 0, out_index = 0;
|
|
(in_index < in_len) && (out_index < out_len);
|
|
in_index++, out_index++) {
|
|
gint nb_bytes_2_decode = 0;
|
|
|
|
if (a_in[in_index] <= 0x7F) {
|
|
/*
|
|
*7 bits long char
|
|
*encoded over 1 byte:
|
|
* 0xxx xxxx
|
|
*/
|
|
c = a_in[in_index];
|
|
nb_bytes_2_decode = 1;
|
|
|
|
} else if ((a_in[in_index] & 0xE0) == 0xC0) {
|
|
/*
|
|
*up to 11 bits long char.
|
|
*encoded over 2 bytes:
|
|
*110x xxxx 10xx xxxx
|
|
*/
|
|
c = a_in[in_index] & 0x1F;
|
|
nb_bytes_2_decode = 2;
|
|
|
|
} else if ((a_in[in_index] & 0xF0) == 0xE0) {
|
|
/*
|
|
*up to 16 bit long char
|
|
*encoded over 3 bytes:
|
|
*1110 xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = a_in[in_index] & 0x0F;
|
|
nb_bytes_2_decode = 3;
|
|
|
|
} else if ((a_in[in_index] & 0xF8) == 0xF0) {
|
|
/*
|
|
*up to 21 bits long char
|
|
*encoded over 4 bytes:
|
|
*1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = a_in[in_index] & 0x7;
|
|
nb_bytes_2_decode = 4;
|
|
|
|
} else if ((a_in[in_index] & 0xFC) == 0xF8) {
|
|
/*
|
|
*up to 26 bits long char
|
|
*encoded over 5 bytes.
|
|
*1111 10xx 10xx xxxx 10xx xxxx
|
|
*10xx xxxx 10xx xxxx
|
|
*/
|
|
c = a_in[in_index] & 3;
|
|
nb_bytes_2_decode = 5;
|
|
|
|
} else if ((a_in[in_index] & 0xFE) == 0xFC) {
|
|
/*
|
|
*up to 31 bits long char
|
|
*encoded over 6 bytes:
|
|
*1111 110x 10xx xxxx 10xx xxxx
|
|
*10xx xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = a_in[in_index] & 1;
|
|
nb_bytes_2_decode = 6;
|
|
|
|
} else {
|
|
/*BAD ENCODING */
|
|
goto end;
|
|
}
|
|
|
|
/*
|
|
*Go and decode the remaining byte(s)
|
|
*(if any) to get the current character.
|
|
*/
|
|
for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
|
|
/*decode the next byte */
|
|
in_index++;
|
|
|
|
/*byte pattern must be: 10xx xxxx */
|
|
if ((a_in[in_index] & 0xC0) != 0x80) {
|
|
goto end;
|
|
}
|
|
|
|
c = (c << 6) | (a_in[in_index] & 0x3F);
|
|
}
|
|
|
|
/*
|
|
*The decoded ucs4 char is now
|
|
*in c.
|
|
*/
|
|
|
|
/************************
|
|
*Some security tests
|
|
***********************/
|
|
|
|
/*be sure c is a char */
|
|
if (c == 0xFFFF || c == 0xFFFE)
|
|
goto end;
|
|
|
|
/*be sure c is inferior to the max ucs4 char value */
|
|
if (c > 0x10FFFF)
|
|
goto end;
|
|
|
|
/*
|
|
*c must be less than UTF16 "lower surrogate begin"
|
|
*or higher than UTF16 "High surrogate end"
|
|
*/
|
|
if (c >= 0xD800 && c <= 0xDFFF)
|
|
goto end;
|
|
|
|
/*Avoid characters that equals zero */
|
|
if (c == 0)
|
|
goto end;
|
|
|
|
a_out[out_index] = c;
|
|
}
|
|
|
|
end:
|
|
*a_out_len = out_index + 1;
|
|
*a_in_len = in_index + 1;
|
|
|
|
return status;
|
|
}
|
|
|
|
/**
|
|
*Reads a character from an utf8 buffer.
|
|
*Actually decode the next character code (unicode character code)
|
|
*and returns it.
|
|
*@param a_in the starting address of the utf8 buffer.
|
|
*@param a_in_len the length of the utf8 buffer.
|
|
*@param a_out output parameter. The resulting read char.
|
|
*@param a_consumed the number of the bytes consumed to
|
|
*decode the returned character code.
|
|
*@return CR_OK upon successful completion, an error code otherwise.
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_read_char_from_utf8_buf (const guchar * a_in,
|
|
gulong a_in_len,
|
|
guint32 * a_out, gulong * a_consumed)
|
|
{
|
|
gulong in_index = 0,
|
|
nb_bytes_2_decode = 0;
|
|
enum CRStatus status = CR_OK;
|
|
|
|
/*
|
|
*to store the final decoded
|
|
*unicode char
|
|
*/
|
|
guint32 c = 0;
|
|
|
|
g_return_val_if_fail (a_in && a_out && a_out
|
|
&& a_consumed, CR_BAD_PARAM_ERROR);
|
|
|
|
if (a_in_len < 1) {
|
|
status = CR_OK;
|
|
goto end;
|
|
}
|
|
|
|
if (*a_in <= 0x7F) {
|
|
/*
|
|
*7 bits long char
|
|
*encoded over 1 byte:
|
|
* 0xxx xxxx
|
|
*/
|
|
c = *a_in;
|
|
nb_bytes_2_decode = 1;
|
|
|
|
} else if ((*a_in & 0xE0) == 0xC0) {
|
|
/*
|
|
*up to 11 bits long char.
|
|
*encoded over 2 bytes:
|
|
*110x xxxx 10xx xxxx
|
|
*/
|
|
c = *a_in & 0x1F;
|
|
nb_bytes_2_decode = 2;
|
|
|
|
} else if ((*a_in & 0xF0) == 0xE0) {
|
|
/*
|
|
*up to 16 bit long char
|
|
*encoded over 3 bytes:
|
|
*1110 xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *a_in & 0x0F;
|
|
nb_bytes_2_decode = 3;
|
|
|
|
} else if ((*a_in & 0xF8) == 0xF0) {
|
|
/*
|
|
*up to 21 bits long char
|
|
*encoded over 4 bytes:
|
|
*1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *a_in & 0x7;
|
|
nb_bytes_2_decode = 4;
|
|
|
|
} else if ((*a_in & 0xFC) == 0xF8) {
|
|
/*
|
|
*up to 26 bits long char
|
|
*encoded over 5 bytes.
|
|
*1111 10xx 10xx xxxx 10xx xxxx
|
|
*10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *a_in & 3;
|
|
nb_bytes_2_decode = 5;
|
|
|
|
} else if ((*a_in & 0xFE) == 0xFC) {
|
|
/*
|
|
*up to 31 bits long char
|
|
*encoded over 6 bytes:
|
|
*1111 110x 10xx xxxx 10xx xxxx
|
|
*10xx xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *a_in & 1;
|
|
nb_bytes_2_decode = 6;
|
|
|
|
} else {
|
|
/*BAD ENCODING */
|
|
goto end;
|
|
}
|
|
|
|
if (nb_bytes_2_decode > a_in_len) {
|
|
status = CR_END_OF_INPUT_ERROR;
|
|
goto end;
|
|
}
|
|
|
|
/*
|
|
*Go and decode the remaining byte(s)
|
|
*(if any) to get the current character.
|
|
*/
|
|
for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
|
|
/*byte pattern must be: 10xx xxxx */
|
|
if ((a_in[in_index] & 0xC0) != 0x80) {
|
|
goto end;
|
|
}
|
|
|
|
c = (c << 6) | (a_in[in_index] & 0x3F);
|
|
}
|
|
|
|
/*
|
|
*The decoded ucs4 char is now
|
|
*in c.
|
|
*/
|
|
|
|
/************************
|
|
*Some security tests
|
|
***********************/
|
|
|
|
/*be sure c is a char */
|
|
if (c == 0xFFFF || c == 0xFFFE)
|
|
goto end;
|
|
|
|
/*be sure c is inferior to the max ucs4 char value */
|
|
if (c > 0x10FFFF)
|
|
goto end;
|
|
|
|
/*
|
|
*c must be less than UTF16 "lower surrogate begin"
|
|
*or higher than UTF16 "High surrogate end"
|
|
*/
|
|
if (c >= 0xD800 && c <= 0xDFFF)
|
|
goto end;
|
|
|
|
/*Avoid characters that equals zero */
|
|
if (c == 0)
|
|
goto end;
|
|
|
|
*a_out = c;
|
|
|
|
end:
|
|
*a_consumed = nb_bytes_2_decode;
|
|
|
|
return status;
|
|
}
|
|
|
|
/**
|
|
*
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
|
|
const guchar * a_in_end, gulong * a_len)
|
|
{
|
|
/*
|
|
*Note: this function can be made shorter
|
|
*but it considers all the cases of the utf8 encoding
|
|
*to ease further extensions ...
|
|
*/
|
|
|
|
guchar *byte_ptr = NULL;
|
|
gint len = 0;
|
|
|
|
/*
|
|
*to store the final decoded
|
|
*unicode char
|
|
*/
|
|
guint c = 0;
|
|
|
|
g_return_val_if_fail (a_in_start && a_in_end && a_len,
|
|
CR_BAD_PARAM_ERROR);
|
|
*a_len = 0;
|
|
|
|
for (byte_ptr = (guchar *) a_in_start;
|
|
byte_ptr <= a_in_end; byte_ptr++) {
|
|
gint nb_bytes_2_decode = 0;
|
|
|
|
if (*byte_ptr <= 0x7F) {
|
|
/*
|
|
*7 bits long char
|
|
*encoded over 1 byte:
|
|
* 0xxx xxxx
|
|
*/
|
|
c = *byte_ptr;
|
|
nb_bytes_2_decode = 1;
|
|
|
|
} else if ((*byte_ptr & 0xE0) == 0xC0) {
|
|
/*
|
|
*up to 11 bits long char.
|
|
*encoded over 2 bytes:
|
|
*110x xxxx 10xx xxxx
|
|
*/
|
|
c = *byte_ptr & 0x1F;
|
|
nb_bytes_2_decode = 2;
|
|
|
|
} else if ((*byte_ptr & 0xF0) == 0xE0) {
|
|
/*
|
|
*up to 16 bit long char
|
|
*encoded over 3 bytes:
|
|
*1110 xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *byte_ptr & 0x0F;
|
|
nb_bytes_2_decode = 3;
|
|
|
|
} else if ((*byte_ptr & 0xF8) == 0xF0) {
|
|
/*
|
|
*up to 21 bits long char
|
|
*encoded over 4 bytes:
|
|
*1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *byte_ptr & 0x7;
|
|
nb_bytes_2_decode = 4;
|
|
|
|
} else if ((*byte_ptr & 0xFC) == 0xF8) {
|
|
/*
|
|
*up to 26 bits long char
|
|
*encoded over 5 bytes.
|
|
*1111 10xx 10xx xxxx 10xx xxxx
|
|
*10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *byte_ptr & 3;
|
|
nb_bytes_2_decode = 5;
|
|
|
|
} else if ((*byte_ptr & 0xFE) == 0xFC) {
|
|
/*
|
|
*up to 31 bits long char
|
|
*encoded over 6 bytes:
|
|
*1111 110x 10xx xxxx 10xx xxxx
|
|
*10xx xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = *byte_ptr & 1;
|
|
nb_bytes_2_decode = 6;
|
|
|
|
} else {
|
|
/*
|
|
*BAD ENCODING
|
|
*/
|
|
return CR_ENCODING_ERROR;
|
|
}
|
|
|
|
/*
|
|
*Go and decode the remaining byte(s)
|
|
*(if any) to get the current character.
|
|
*/
|
|
for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
|
|
/*decode the next byte */
|
|
byte_ptr++;
|
|
|
|
/*byte pattern must be: 10xx xxxx */
|
|
if ((*byte_ptr & 0xC0) != 0x80) {
|
|
return CR_ENCODING_ERROR;
|
|
}
|
|
|
|
c = (c << 6) | (*byte_ptr & 0x3F);
|
|
}
|
|
|
|
/*
|
|
*The decoded ucs4 char is now
|
|
*in c.
|
|
*/
|
|
|
|
if (c <= 0xFF) { /*Add other conditions to support
|
|
*other char sets (ucs2, ucs3, ucs4).
|
|
*/
|
|
len++;
|
|
} else {
|
|
/*the char is too long to fit
|
|
*into the supposed charset len.
|
|
*/
|
|
return CR_ENCODING_ERROR;
|
|
}
|
|
}
|
|
|
|
*a_len = len;
|
|
|
|
return CR_OK;
|
|
}
|
|
|
|
/**
|
|
*Converts an utf8 string into an ucs4 string.
|
|
*@param a_in the input string to convert.
|
|
*@param a_in_len in/out parameter. The length of the input
|
|
*string. After return, points to the actual number of bytes
|
|
*consumed. This can be useful to debug the input stream in case
|
|
*of encoding error.
|
|
*@param a_out out parameter. Points to the output string. It is allocated
|
|
*by this function and must be freed by the caller.
|
|
*@param a_out_len out parameter. The length of the output string.
|
|
*@return CR_OK upon successful completion, an error code otherwise.
|
|
*
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
|
|
gulong * a_in_len,
|
|
guint32 ** a_out, gulong * a_out_len)
|
|
{
|
|
enum CRStatus status = CR_OK;
|
|
|
|
g_return_val_if_fail (a_in && a_in_len
|
|
&& a_out && a_out_len, CR_BAD_PARAM_ERROR);
|
|
|
|
status = cr_utils_utf8_str_len_as_ucs4 (a_in,
|
|
&a_in[*a_in_len - 1],
|
|
a_out_len);
|
|
|
|
g_return_val_if_fail (status == CR_OK, status);
|
|
|
|
*a_out = g_malloc0 (*a_out_len * sizeof (guint32));
|
|
|
|
status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
|
|
|
|
return status;
|
|
}
|
|
|
|
/**
|
|
*Converts an ucs4 buffer into an utf8 buffer.
|
|
*
|
|
*@param a_in the input ucs4 buffer to convert.
|
|
*@param a_in_len in/out parameter. The size of the
|
|
*input buffer to convert. After return, this parameter contains
|
|
*the actual number of characters consumed.
|
|
*@param a_out the output converted utf8 buffer. Must be allocated by
|
|
*the caller.
|
|
*@param a_out_len in/out parameter. The size of the output buffer.
|
|
*If this size is actually smaller than the real needed size, the function
|
|
*just converts what it can and returns a success status. After return,
|
|
*this param points to the actual number of bytes in the buffer.
|
|
*@return CR_OK upon successful completion, an error code otherwise.
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_ucs4_to_utf8 (const guint32 * a_in,
|
|
gulong * a_in_len, guchar * a_out, gulong * a_out_len)
|
|
{
|
|
gulong in_len = 0,
|
|
in_index = 0,
|
|
out_index = 0;
|
|
enum CRStatus status = CR_OK;
|
|
|
|
g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
|
|
CR_BAD_PARAM_ERROR);
|
|
|
|
if (*a_in_len < 1) {
|
|
status = CR_OK;
|
|
goto end;
|
|
}
|
|
|
|
in_len = *a_in_len;
|
|
|
|
for (in_index = 0; in_index < in_len; in_index++) {
|
|
/*
|
|
*FIXME: return whenever we encounter forbidden char values.
|
|
*/
|
|
|
|
if (a_in[in_index] <= 0x7F) {
|
|
a_out[out_index] = a_in[in_index];
|
|
out_index++;
|
|
} else if (a_in[in_index] <= 0x7FF) {
|
|
a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
|
|
a_out[out_index + 1] =
|
|
(0x80 | (a_in[in_index] & 0x3F));
|
|
out_index += 2;
|
|
} else if (a_in[in_index] <= 0xFFFF) {
|
|
a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
|
|
a_out[out_index + 1] =
|
|
(0x80 | ((a_in[in_index] >> 6) & 0x3F));
|
|
a_out[out_index + 2] =
|
|
(0x80 | (a_in[in_index] & 0x3F));
|
|
out_index += 3;
|
|
} else if (a_in[in_index] <= 0x1FFFFF) {
|
|
a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
|
|
a_out[out_index + 1]
|
|
= (0x80 | ((a_in[in_index] >> 12) & 0x3F));
|
|
a_out[out_index + 2]
|
|
= (0x80 | ((a_in[in_index] >> 6) & 0x3F));
|
|
a_out[out_index + 3]
|
|
= (0x80 | (a_in[in_index] & 0x3F));
|
|
out_index += 4;
|
|
} else if (a_in[in_index] <= 0x3FFFFFF) {
|
|
a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
|
|
a_out[out_index + 1] =
|
|
(0x80 | (a_in[in_index] >> 18));
|
|
a_out[out_index + 2]
|
|
= (0x80 | ((a_in[in_index] >> 12) & 0x3F));
|
|
a_out[out_index + 3]
|
|
= (0x80 | ((a_in[in_index] >> 6) & 0x3F));
|
|
a_out[out_index + 4]
|
|
= (0x80 | (a_in[in_index] & 0x3F));
|
|
out_index += 5;
|
|
} else if (a_in[in_index] <= 0x7FFFFFFF) {
|
|
a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
|
|
a_out[out_index + 1] =
|
|
(0x80 | (a_in[in_index] >> 24));
|
|
a_out[out_index + 2]
|
|
= (0x80 | ((a_in[in_index] >> 18) & 0x3F));
|
|
a_out[out_index + 3]
|
|
= (0x80 | ((a_in[in_index] >> 12) & 0x3F));
|
|
a_out[out_index + 4]
|
|
= (0x80 | ((a_in[in_index] >> 6) & 0x3F));
|
|
a_out[out_index + 4]
|
|
= (0x80 | (a_in[in_index] & 0x3F));
|
|
out_index += 6;
|
|
} else {
|
|
status = CR_ENCODING_ERROR;
|
|
goto end;
|
|
}
|
|
} /*end for */
|
|
|
|
end:
|
|
*a_in_len = in_index + 1;
|
|
*a_out_len = out_index + 1;
|
|
|
|
return status;
|
|
}
|
|
|
|
/**
|
|
*Converts an ucs4 string into an utf8 string.
|
|
*@param a_in the input string to convert.
|
|
*@param a_in_len in/out parameter. The length of the input
|
|
*string. After return, points to the actual number of characters
|
|
*consumed. This can be useful to debug the input string in case
|
|
*of encoding error.
|
|
*@param a_out out parameter. Points to the output string. It is allocated
|
|
*by this function and must be freed by the caller.
|
|
*@param a_out_len out parameter. The length (in bytes) of the output string.
|
|
*@return CR_OK upon successful completion, an error code otherwise.
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
|
|
gulong * a_in_len,
|
|
guchar ** a_out, gulong * a_out_len)
|
|
{
|
|
enum CRStatus status = CR_OK;
|
|
|
|
g_return_val_if_fail (a_in && a_in_len && a_out
|
|
&& a_out_len, CR_BAD_PARAM_ERROR);
|
|
|
|
status = cr_utils_ucs4_str_len_as_utf8 (a_in,
|
|
&a_in[*a_out_len - 1],
|
|
a_out_len);
|
|
|
|
g_return_val_if_fail (status == CR_OK, status);
|
|
|
|
status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
|
|
|
|
return status;
|
|
}
|
|
|
|
/**
|
|
*Converts an ucs1 buffer into an utf8 buffer.
|
|
*The caller must know the size of the resulting buffer and
|
|
*allocate it prior to calling this function.
|
|
*
|
|
*@param a_in the input ucs1 buffer.
|
|
*
|
|
*@param a_in_len in/out parameter. The length of the input buffer.
|
|
*After return, points to the number of bytes actually consumed even
|
|
*in case of encoding error.
|
|
*
|
|
*@param a_out out parameter. The output utf8 converted buffer.
|
|
*
|
|
*@param a_out_len in/out parameter. The size of the output buffer.
|
|
*If the output buffer size is shorter than the actual needed size,
|
|
*this function just convert what it can.
|
|
*
|
|
*@return CR_OK upon successful completion, an error code otherwise.
|
|
*
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_ucs1_to_utf8 (const guchar * a_in,
|
|
gulong * a_in_len, guchar * a_out, gulong * a_out_len)
|
|
{
|
|
gulong out_index = 0,
|
|
in_index = 0,
|
|
in_len = 0,
|
|
out_len = 0;
|
|
enum CRStatus status = CR_OK;
|
|
|
|
g_return_val_if_fail (a_in && a_in_len
|
|
&& a_out_len,
|
|
CR_BAD_PARAM_ERROR);
|
|
|
|
if (*a_in_len == 0) {
|
|
*a_out_len = 0 ;
|
|
return status;
|
|
}
|
|
g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;
|
|
|
|
in_len = *a_in_len;
|
|
out_len = *a_out_len;
|
|
|
|
for (in_index = 0, out_index = 0;
|
|
(in_index < in_len) && (out_index < out_len); in_index++) {
|
|
/*
|
|
*FIXME: return whenever we encounter forbidden char values.
|
|
*/
|
|
|
|
if (a_in[in_index] <= 0x7F) {
|
|
a_out[out_index] = a_in[in_index];
|
|
out_index++;
|
|
} else {
|
|
a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
|
|
a_out[out_index + 1] =
|
|
(0x80 | (a_in[in_index] & 0x3F));
|
|
out_index += 2;
|
|
}
|
|
} /*end for */
|
|
|
|
*a_in_len = in_index;
|
|
*a_out_len = out_index;
|
|
|
|
return status;
|
|
}
|
|
|
|
/**
|
|
*Converts an ucs1 string into an utf8 string.
|
|
*@param a_in_start the beginning of the input string to convert.
|
|
*@param a_in_end the end of the input string to convert.
|
|
*@param a_out out parameter. The converted string.
|
|
*@param a_out out parameter. The length of the converted string.
|
|
*@return CR_OK upon successful completion, an error code otherwise.
|
|
*
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
|
|
gulong * a_in_len,
|
|
guchar ** a_out, gulong * a_out_len)
|
|
{
|
|
gulong out_len = 0;
|
|
enum CRStatus status = CR_OK;
|
|
|
|
g_return_val_if_fail (a_in && a_in_len && a_out
|
|
&& a_out_len, CR_BAD_PARAM_ERROR);
|
|
|
|
if (*a_in_len < 1) {
|
|
*a_out_len = 0;
|
|
*a_out = NULL;
|
|
return CR_OK;
|
|
}
|
|
|
|
status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
|
|
&out_len);
|
|
|
|
g_return_val_if_fail (status == CR_OK, status);
|
|
|
|
*a_out = g_malloc0 (out_len);
|
|
|
|
status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);
|
|
|
|
*a_out_len = out_len;
|
|
|
|
return status;
|
|
}
|
|
|
|
/**
|
|
*Converts an utf8 buffer into an ucs1 buffer.
|
|
*The caller must know the size of the resulting
|
|
*converted buffer, and allocated it prior to calling this
|
|
*function.
|
|
*
|
|
*@param a_in the input utf8 buffer to convert.
|
|
*
|
|
*@param a_in_len in/out parameter. The size of the input utf8 buffer.
|
|
*After return, points to the number of bytes consumed
|
|
*by the function even in case of encoding error.
|
|
*
|
|
*@param a_out out parameter. Points to the resulting buffer.
|
|
*Must be allocated by the caller. If the size of a_out is shorter
|
|
*than its required size, this function converts what it can and return
|
|
*a successful status.
|
|
*
|
|
*@param a_out_len in/out parameter. The size of the output buffer.
|
|
*After return, points to the number of bytes consumed even in case of
|
|
*encoding error.
|
|
*
|
|
*@return CR_OK upon successful completion, an error code otherwise.
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_utf8_to_ucs1 (const guchar * a_in,
|
|
gulong * a_in_len, guchar * a_out, gulong * a_out_len)
|
|
{
|
|
gulong in_index = 0,
|
|
out_index = 0,
|
|
in_len = 0,
|
|
out_len = 0;
|
|
enum CRStatus status = CR_OK;
|
|
|
|
/*
|
|
*to store the final decoded
|
|
*unicode char
|
|
*/
|
|
guint32 c = 0;
|
|
|
|
g_return_val_if_fail (a_in && a_in_len
|
|
&& a_out && a_out_len, CR_BAD_PARAM_ERROR);
|
|
|
|
if (*a_in_len < 1) {
|
|
goto end;
|
|
}
|
|
|
|
in_len = *a_in_len;
|
|
out_len = *a_out_len;
|
|
|
|
for (in_index = 0, out_index = 0;
|
|
(in_index < in_len) && (out_index < out_len);
|
|
in_index++, out_index++) {
|
|
gint nb_bytes_2_decode = 0;
|
|
|
|
if (a_in[in_index] <= 0x7F) {
|
|
/*
|
|
*7 bits long char
|
|
*encoded over 1 byte:
|
|
* 0xxx xxxx
|
|
*/
|
|
c = a_in[in_index];
|
|
nb_bytes_2_decode = 1;
|
|
|
|
} else if ((a_in[in_index] & 0xE0) == 0xC0) {
|
|
/*
|
|
*up to 11 bits long char.
|
|
*encoded over 2 bytes:
|
|
*110x xxxx 10xx xxxx
|
|
*/
|
|
c = a_in[in_index] & 0x1F;
|
|
nb_bytes_2_decode = 2;
|
|
|
|
} else if ((a_in[in_index] & 0xF0) == 0xE0) {
|
|
/*
|
|
*up to 16 bit long char
|
|
*encoded over 3 bytes:
|
|
*1110 xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = a_in[in_index] & 0x0F;
|
|
nb_bytes_2_decode = 3;
|
|
|
|
} else if ((a_in[in_index] & 0xF8) == 0xF0) {
|
|
/*
|
|
*up to 21 bits long char
|
|
*encoded over 4 bytes:
|
|
*1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = a_in[in_index] & 0x7;
|
|
nb_bytes_2_decode = 4;
|
|
|
|
} else if ((a_in[in_index] & 0xFC) == 0xF8) {
|
|
/*
|
|
*up to 26 bits long char
|
|
*encoded over 5 bytes.
|
|
*1111 10xx 10xx xxxx 10xx xxxx
|
|
*10xx xxxx 10xx xxxx
|
|
*/
|
|
c = a_in[in_index] & 3;
|
|
nb_bytes_2_decode = 5;
|
|
|
|
} else if ((a_in[in_index] & 0xFE) == 0xFC) {
|
|
/*
|
|
*up to 31 bits long char
|
|
*encoded over 6 bytes:
|
|
*1111 110x 10xx xxxx 10xx xxxx
|
|
*10xx xxxx 10xx xxxx 10xx xxxx
|
|
*/
|
|
c = a_in[in_index] & 1;
|
|
nb_bytes_2_decode = 6;
|
|
|
|
} else {
|
|
/*BAD ENCODING */
|
|
status = CR_ENCODING_ERROR;
|
|
goto end;
|
|
}
|
|
|
|
/*
|
|
*Go and decode the remaining byte(s)
|
|
*(if any) to get the current character.
|
|
*/
|
|
if (in_index + nb_bytes_2_decode - 1 >= in_len) {
|
|
goto end;
|
|
}
|
|
|
|
for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
|
|
/*decode the next byte */
|
|
in_index++;
|
|
|
|
/*byte pattern must be: 10xx xxxx */
|
|
if ((a_in[in_index] & 0xC0) != 0x80) {
|
|
status = CR_ENCODING_ERROR;
|
|
goto end;
|
|
}
|
|
|
|
c = (c << 6) | (a_in[in_index] & 0x3F);
|
|
}
|
|
|
|
/*
|
|
*The decoded ucs4 char is now
|
|
*in c.
|
|
*/
|
|
|
|
if (c > 0xFF) {
|
|
status = CR_ENCODING_ERROR;
|
|
goto end;
|
|
}
|
|
|
|
a_out[out_index] = c;
|
|
}
|
|
|
|
end:
|
|
*a_out_len = out_index;
|
|
*a_in_len = in_index;
|
|
|
|
return status;
|
|
}
|
|
|
|
/**
|
|
*Converts an utf8 buffer into an
|
|
*ucs1 buffer.
|
|
*@param a_in_start the start of the input buffer.
|
|
*@param a_in_end the end of the input buffer.
|
|
*@param a_out out parameter. The resulting converted ucs4 buffer.
|
|
*Must be freed by the caller.
|
|
*@param a_out_len out parameter. The length of the converted buffer.
|
|
*@return CR_OK upon successful completion, an error code otherwise.
|
|
*Note that out parameters are valid if and only if this function
|
|
*returns CR_OK.
|
|
*/
|
|
enum CRStatus
|
|
cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
|
|
gulong * a_in_len,
|
|
guchar ** a_out, gulong * a_out_len)
|
|
{
|
|
enum CRStatus status = CR_OK;
|
|
|
|
g_return_val_if_fail (a_in && a_in_len
|
|
&& a_out && a_out_len, CR_BAD_PARAM_ERROR);
|
|
|
|
if (*a_in_len < 1) {
|
|
*a_out_len = 0;
|
|
*a_out = NULL;
|
|
return CR_OK;
|
|
}
|
|
|
|
status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
|
|
a_out_len);
|
|
|
|
g_return_val_if_fail (status == CR_OK, status);
|
|
|
|
*a_out = g_malloc0 (*a_out_len * sizeof (guint32));
|
|
|
|
status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
|
|
return status;
|
|
}
|
|
|
|
/*****************************************
|
|
*CSS basic types identification utilities
|
|
*****************************************/
|
|
|
|
/**
|
|
*Returns TRUE if a_char is a white space as
|
|
*defined in the css spec in chap 4.1.1.
|
|
*
|
|
*white-space ::= ' '| \t|\r|\n|\f
|
|
*
|
|
*@param a_char the character to test.
|
|
*return TRUE if is a white space, false otherwise.
|
|
*/
|
|
gboolean
|
|
cr_utils_is_white_space (guint32 a_char)
|
|
{
|
|
switch (a_char) {
|
|
case ' ':
|
|
case '\t':
|
|
case '\r':
|
|
case '\n':
|
|
case '\f':
|
|
return TRUE;
|
|
break;
|
|
default:
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
/**
|
|
*Returns true if the character is a newline
|
|
*as defined in the css spec in the chap 4.1.1.
|
|
*
|
|
*nl ::= \n|\r\n|\r|\f
|
|
*
|
|
*@param a_char the character to test.
|
|
*@return TRUE if the character is a newline, FALSE otherwise.
|
|
*/
|
|
gboolean
|
|
cr_utils_is_newline (guint32 a_char)
|
|
{
|
|
switch (a_char) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\f':
|
|
return TRUE;
|
|
break;
|
|
default:
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
/**
|
|
*returns TRUE if the char is part of an hexa num char:
|
|
*i.e hexa_char ::= [0-9A-F]
|
|
*/
|
|
gboolean
|
|
cr_utils_is_hexa_char (guint32 a_char)
|
|
{
|
|
if ((a_char >= '0' && a_char <= '9')
|
|
|| (a_char >= 'A' && a_char <= 'F')) {
|
|
return TRUE;
|
|
}
|
|
return FALSE;
|
|
}
|
|
|
|
/**
|
|
*Returns true if the character is a nonascii
|
|
*character (as defined in the css spec chap 4.1.1):
|
|
*
|
|
*nonascii ::= [^\0-\177]
|
|
*
|
|
*@param a_char the character to test.
|
|
*@return TRUE if the character is a nonascii char,
|
|
*FALSE otherwise.
|
|
*/
|
|
gboolean
|
|
cr_utils_is_nonascii (guint32 a_char)
|
|
{
|
|
if (a_char <= 177) {
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/**
|
|
*Dumps a character a_nb times on a file.
|
|
*@param a_char the char to dump
|
|
*@param a_fp the destination file pointer
|
|
*@param a_nb the number of times a_char is to be dumped.
|
|
*/
|
|
void
|
|
cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
|
|
{
|
|
glong i = 0;
|
|
|
|
for (i = 0; i < a_nb; i++) {
|
|
fprintf (a_fp, "%c", a_char);
|
|
}
|
|
}
|
|
|
|
void
|
|
cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
|
|
{
|
|
glong i = 0;
|
|
|
|
g_return_if_fail (a_string);
|
|
|
|
for (i = 0; i < a_nb; i++) {
|
|
g_string_append_printf (a_string, "%c", a_char);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*Duplicates a list of GString instances.
|
|
*@return the duplicated list of GString instances or NULL if
|
|
*something bad happened.
|
|
*@param a_list_of_strings the list of strings to be duplicated.
|
|
*/
|
|
GList *
|
|
cr_utils_dup_glist_of_string (GList const * a_list_of_strings)
|
|
{
|
|
GList const *cur = NULL;
|
|
GList *result = NULL;
|
|
|
|
g_return_val_if_fail (a_list_of_strings, NULL);
|
|
|
|
for (cur = a_list_of_strings; cur; cur = cur->next) {
|
|
GString *str = NULL;
|
|
|
|
str = g_string_new_len (((GString *) cur->data)->str,
|
|
((GString *) cur->data)->len);
|
|
if (str)
|
|
result = g_list_append (result, str);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
*Duplicate a GList where the GList::data is a CRString.
|
|
*@param a_list_of_strings the list to duplicate
|
|
*@return the duplicated list, or NULL if something bad
|
|
*happened.
|
|
*/
|
|
GList *
|
|
cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings)
|
|
{
|
|
GList const *cur = NULL;
|
|
GList *result = NULL;
|
|
|
|
g_return_val_if_fail (a_list_of_strings, NULL);
|
|
|
|
for (cur = a_list_of_strings; cur; cur = cur->next) {
|
|
CRString *str = NULL;
|
|
|
|
str = cr_string_dup ((CRString const *) cur->data) ;
|
|
if (str)
|
|
result = g_list_append (result, str);
|
|
}
|
|
|
|
return result;
|
|
}
|