Files
opennurbs/opennurbs_unicode.cpp
2018-09-10 17:39:40 -07:00

2937 lines
70 KiB
C++
Raw Blame History

/* $NoKeywords: $ */
/*
//
// Copyright (c) 1993-2012 Robert McNeel & Associates. All rights reserved.
// OpenNURBS, Rhinoceros, and Rhino3D are registered trademarks of Robert
// McNeel & Associates.
//
// THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY.
// ALL IMPLIED WARRANTIES OF FITNESS FOR ANY PARTICULAR PURPOSE AND OF
// MERCHANTABILITY ARE HEREBY DISCLAIMED.
//
// For complete openNURBS copyright information see <http://www.opennurbs.org>.
//
////////////////////////////////////////////////////////////////
*/
#include "opennurbs.h"
#if !defined(ON_COMPILING_OPENNURBS)
// This check is included in all opennurbs source .c and .cpp files to ensure
// ON_COMPILING_OPENNURBS is defined when opennurbs source is compiled.
// When opennurbs source is being compiled, ON_COMPILING_OPENNURBS is defined
// and the opennurbs .h files alter what is declared and how it is declared.
#error ON_COMPILING_OPENNURBS must be defined when compiling opennurbs
#endif
// Returns 1 when u is in the range 0 ... 0x10FFFF and is not one of
// the UTF-16 surrogate values 0xD800 ... 0xDFFF; returns 0 otherwise.
int ON_IsValidUnicodeCodePoint(ON__UINT32 u)
{
  if ( u >= 0xD800 && u <= 0xDFFF )
  {
    // surrogate values are never code points
    return 0;
  }
  return ( u <= 0x10FFFF ) ? 1 : 0;
}
// Returns 1 when the UTF-32 element c is below the surrogate block
// (c < 0xD800) or between 0xE000 and 0x10FFFF inclusive; 0 otherwise.
int ON_IsValidUTF32Value(
  ON__UINT32 c
  )
{
  const bool bBelowSurrogates = ( c < 0xD800 );
  const bool bAboveSurrogates = ( c >= 0xE000 && c <= 0x10FFFF );
  return ( bBelowSurrogates || bAboveSurrogates );
}
// Returns 1 when c fits in one 16-bit element (c <= 0xFFFF) and is not
// in the surrogate range 0xD800 ... 0xDFFF; returns 0 otherwise.
int ON_IsValidSingleElementUTF16Value(ON__UINT32 c)
{
  if ( c > 0xFFFF )
    return 0;
  return ( c < 0xD800 || c >= 0xE000 ) ? 1 : 0;
}
// Returns 1 when c is 0 ... 0xD7FF or 0xE000 ... 0xFFFF, that is, a value
// that a single UTF-16 element can carry without being a surrogate.
int ON_IsValidUTF16Singleton(ON__UINT32 c)
{
  const bool bIsSurrogate = ( c > 0xD7FF && c < 0xE000 );
  const bool bFitsIn16Bits = ( c <= 0xFFFF );
  return ( bFitsIn16Bits && !bIsSurrogate );
}
// Returns ON_UTF_16LE when ON::Endian() reports little endian and
// ON_UTF_16BE in every other case.
enum ON_UnicodeEncoding ON_UnicodeNativeCPU_UTF16()
{
  if ( ON::endian::little_endian == ON::Endian() )
    return ON_UTF_16LE;
  return ON_UTF_16BE;
}
// Returns ON_UTF_32LE when ON::Endian() reports little endian and
// ON_UTF_32BE in every other case.
enum ON_UnicodeEncoding ON_UnicodeNativeCPU_UTF32()
{
  enum ON_UnicodeEncoding native_encoding = ON_UTF_32BE;
  if ( ON::endian::little_endian == ON::Endian() )
    native_encoding = ON_UTF_32LE;
  return native_encoding;
}
// Returns 1 when the char c holds a value in 0 ... 0x7F and 0 otherwise.
// The test works whether plain char is signed or unsigned.
int ON_IsValidSingleByteUTF8CharValue(
  char c
  )
{
  const int value = c;
  if ( value < 0 )
    return 0;
  return ( value <= 0x7F ) ? 1 : 0;
}
// Returns 1 when the char c holds a value in 0 ... 0x7F and 0 otherwise,
// independent of the signedness of plain char.
int ON_IsValidUTF8SingletonChar(
  char c
  )
{
  const int code = c;
  const bool bInAsciiRange = ( code >= 0 && code <= 0x7F );
  return bInAsciiRange ? 1 : 0;
}
// Returns 1 when c is at most 0x7F (the values a one-byte UTF-8
// encoding can hold) and 0 otherwise.
int ON_IsValidSingleElementUTF8Value(
  ON__UINT32 c
  )
{
  return ( c < 0x80 ) ? 1 : 0;
}
// Returns 1 when c is in 0 ... 0x7F and 0 for every larger value.
int ON_IsValidUTF8Singleton(
  ON__UINT32 c
  )
{
  if ( c > 0x7FU )
    return 0;
  return 1;
}
// Returns 1 when w1 is in 0xD800 ... 0xDBFF and w2 is in 0xDC00 ... 0xDFFF,
// and 0 for every other combination.
int ON_IsValidUTF16SurrogatePair(
  unsigned int w1,
  unsigned int w2
  )
{
  const bool bLeadingOk = ( w1 >= 0xD800U && w1 <= 0xDBFFU );
  const bool bTrailingOk = ( w2 >= 0xDC00U && w2 <= 0xDFFFU );
  return ( bLeadingOk && bTrailingOk ) ? 1 : 0;
}
int ON_IsValidSingleElementWideCharValue(
wchar_t w
)
{
#pragma ON_PRAGMA_WARNING_PUSH
// warning C4127: conditional expression is constant
#pragma ON_PRAGMA_WARNING_DISABLE_MSC( 4127 )
if (1 == sizeof(w))
return ON_IsValidSingleElementUTF8Value((ON__UINT32)w);
if (2 == sizeof(w))
return ON_IsValidSingleElementUTF16Value((ON__UINT32)w);
return ON_IsValidUTF32Value((ON__UINT32)w);
#pragma ON_PRAGMA_WARNING_POP
}
// Inspects the first bytes of buffer for a byte order mark.
//
// Recognized patterns (checked against the first 2 to 4 bytes):
//   00 00 FE FF -> ON_UTF_32BE (needs sizeof_buffer >= 4)
//   EF BB BF    -> ON_UTF_8    (needs sizeof_buffer >= 3)
//   FE FF       -> ON_UTF_16BE
//   FF FE 00 00 -> ON_UTF_32LE (needs sizeof_buffer >= 4)
//   FF FE       -> ON_UTF_16LE (when the 32LE pattern does not match)
//
// Returns ON_UTF_unset when buffer is null, sizeof_buffer is less than 2,
// or no pattern matches.
enum ON_UnicodeEncoding ON_IsUTFByteOrderMark(
  const void* buffer,
  size_t sizeof_buffer
  )
{
  if ( nullptr == buffer || sizeof_buffer < 2 )
    return ON_UTF_unset;

  const unsigned char* bytes = static_cast<const unsigned char*>(buffer);
  switch ( bytes[0] )
  {
  case 0x00:
    if ( sizeof_buffer >= 4 && 0x00 == bytes[1] && 0xFE == bytes[2] && 0xFF == bytes[3] )
      return ON_UTF_32BE;
    break;

  case 0xEF:
    if ( sizeof_buffer >= 3 && 0xBB == bytes[1] && 0xBF == bytes[2] )
      return ON_UTF_8;
    break;

  case 0xFE:
    if ( 0xFF == bytes[1] )
      return ON_UTF_16BE;
    break;

  case 0xFF:
    if ( 0xFE == bytes[1] )
    {
      // FF FE begins both the UTF-16LE and the UTF-32LE marks;
      // two following zero bytes select UTF-32LE.
      if ( sizeof_buffer >= 4 && 0x00 == bytes[2] && 0x00 == bytes[3] )
        return ON_UTF_32LE;
      return ON_UTF_16LE;
    }
    break;

  default:
    break;
  }

  return ON_UTF_unset;
}
// Returns the number of bytes in the byte order mark for encoding e:
// 3 for ON_UTF_8, 2 for the ON_UTF_16 variants, 4 for the ON_UTF_32
// variants, and 0 for any other value.
unsigned int ON_UTFSizeofByteOrderMark(
  ON_UnicodeEncoding e
  )
{
  switch (e)
  {
  case ON_UTF_8:
    return 3;

  case ON_UTF_16:
  case ON_UTF_16BE:
  case ON_UTF_16LE:
    return 2;

  case ON_UTF_32:
  case ON_UTF_32BE:
  case ON_UTF_32LE:
    return 4;

  default:
    break;
  }
  return 0;
}
// Tests whether sUTF8 begins with the UTF-8 byte order mark EF BB BF.
//
// Parameters:
//   sUTF8       - string to test; a null pointer yields 0.
//   sUTF8_count - number of char elements available in sUTF8, or -1 when
//                 sUTF8 is null terminated.
// Returns:
//   Nonzero when the first three bytes are EF BB BF, 0 otherwise.
//
// The previous guard used "||" ( -1 != count || count < 3 ), which is true
// for every count, so the function always returned 0 and a byte order mark
// was never detected. With "&&" only explicit counts smaller than 3
// (including negative counts other than -1) are rejected.
static int ON_IsUTF8ByteOrderMark(
  const char* sUTF8,
  int sUTF8_count
  )
{
  if ( 0 == sUTF8 )
    return 0;
  if ( -1 != sUTF8_count && sUTF8_count < 3 )
    return 0;
  // When sUTF8_count is -1 the string is null terminated and the
  // left-to-right short circuit evaluation stops at the terminator
  // before reading past it.
  return (0xEF == (unsigned char)(sUTF8[0]) && 0xBB == (unsigned char)(sUTF8[1]) && 0xBF == (unsigned char)(sUTF8[2]));
}
// Writes the UTF-8 style multi-byte encoding of u to sUTF8[].
//
// Parameters:
//   u     - value to encode. Values up to 0x7FFFFFFF are encoded, using
//           1 to 6 bytes. (0x10FFFF is the maximum valid unicode code
//           point; larger values are still encoded with 4, 5 or 6 bytes.)
//   sUTF8 - output buffer with room for 6 chars. Only the first
//           "return value" elements are written.
// Returns:
//   Number of bytes written (1 ... 6), or 0 when u > 0x7FFFFFFF, in which
//   case nothing is written.
//
// Layout by length:
//   1: 0xxxxxxx                                               ( 7 bits)
//   2: 110xxxxx 10xxxxxx                                      (11 bits)
//   3: 1110xxxx 10xxxxxx 10xxxxxx                             (16 bits)
//   4: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx                    (21 bits)
//   5: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx           (26 bits)
//   6: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx  (31 bits)
int ON_EncodeUTF8( ON__UINT32 u, char sUTF8[6] )
{
  if ( u <= 0x7F )
  {
    sUTF8[0] = (char)u;
    return 1;
  }

  int byte_count;
  ON__UINT32 lead_marker; // high bits of the first byte that announce the length
  if ( u <= 0x7FF )
  {
    byte_count = 2;
    lead_marker = 0xC0;
  }
  else if ( u <= 0xFFFF )
  {
    byte_count = 3;
    lead_marker = 0xE0;
  }
  else if ( u <= 0x1FFFFF )
  {
    byte_count = 4;
    lead_marker = 0xF0;
  }
  else if ( u <= 0x3FFFFFF )
  {
    byte_count = 5;
    lead_marker = 0xF8;
  }
  else if ( u <= 0x7FFFFFFF )
  {
    byte_count = 6;
    lead_marker = 0xFC;
  }
  else
  {
    // more than 31 bits cannot be encoded
    return 0;
  }

  // Fill the continuation bytes from last to first, consuming the
  // low 6 bits of the value each time.
  ON__UINT32 remaining_bits = u;
  for ( int n = byte_count - 1; n > 0; n-- )
  {
    sUTF8[n] = (char)(0x80 | (remaining_bits & 0x3F));
    remaining_bits >>= 6;
  }

  // Whatever is left fits in the payload bits of the first byte.
  sUTF8[0] = (char)(lead_marker | remaining_bits);
  return byte_count;
}
static int ON_DecodeUTF8Helper(
const char* sUTF8,
int sUTF8_count,
ON__UINT32* value,
unsigned int* error_status
)
{
#define INPUT_BUFFER_TOO_SHORT 16
#define INVALID_CONTINUATION_VALUE 16
#define OVERLONG_ENCODING 8
ON__UINT32 u;
char c;
c = sUTF8[0];
if ( 0 == (0x80 & c) )
{
// 1 byte ASCII encoding: 0xxxxxxx
*value = c;
return 1;
}
if ( 0xC0 == ( 0xE0 & c) )
{
// 2 byte character encoding: 10xxxxxx, 10xxxxxx
if ( sUTF8_count < 2 )
{
*error_status |= INPUT_BUFFER_TOO_SHORT; // input buffer too short
return 0;
}
u = (0x1F & c);
c = sUTF8[1];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
if ( u <= 0x7F )
{
*error_status |= OVERLONG_ENCODING; // overlong 2 byte character encoding
}
*value = u;
return 2;
}
if ( 0xE0 == ( 0xF0 & c) )
{
// 3 byte character encoding: 110xxxxx, 10xxxxxx, 10xxxxxx
if ( sUTF8_count < 3 )
{
*error_status |= INPUT_BUFFER_TOO_SHORT; // input buffer too short
return 0;
}
u = (0x0F & c);
c = sUTF8[1];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
c = sUTF8[2];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
if ( u <= 0x7FF )
{
*error_status |= OVERLONG_ENCODING; // overlong 3 byte character encoding
}
*value = u;
return 3;
}
if ( 0xF0 == ( 0xF8 & c) )
{
// 4 byte character encoding: 11110xxx, 10xxxxxx, 10xxxxxx, 10xxxxxx
if ( sUTF8_count < 4 )
{
*error_status |= INPUT_BUFFER_TOO_SHORT; // input buffer too short
return 0;
}
u = (0x07 & c);
c = sUTF8[1];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
c = sUTF8[2];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
c = sUTF8[3];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
if ( u <= 0xFFFF )
{
*error_status |= OVERLONG_ENCODING; // overlong 4 byte character encoding
}
*value = u;
return 4;
}
if ( 0xF8 == ( 0xFC & c) )
{
// 5 byte character encoding: 111110xx, 10xxxxxx, 10xxxxxx, 10xxxxxx, 10xxxxxx
if ( sUTF8_count < 5 )
{
*error_status |= INPUT_BUFFER_TOO_SHORT; // input buffer too short
return 0;
}
u = (0x03 & c);
c = sUTF8[1];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
c = sUTF8[2];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
c = sUTF8[3];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
c = sUTF8[4];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
if ( u <= 0x1FFFFF )
{
*error_status |= OVERLONG_ENCODING; // overlong 5 byte character encoding
}
*value = u;
return 5;
}
if ( 0xFC == ( 0xFE & c) )
{
// 6 byte character encoding: 110xxxxx, 10xxxxxx, 10xxxxxx, 10xxxxxx, 10xxxxxx, 10xxxxxx
if ( sUTF8_count < 6 )
{
*error_status |= INPUT_BUFFER_TOO_SHORT; // input buffer too short
return 0;
}
u = (0x01 & c);
c = sUTF8[1];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
c = sUTF8[2];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
c = sUTF8[3];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
c = sUTF8[4];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
c = sUTF8[5];
if ( 0x80 != ( 0xC0 & c) )
{
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 continuation value
return 0;
}
u *= 64;
u |= (0x3F & c);
if ( u <= 0x3FFFFFF )
{
*error_status |= OVERLONG_ENCODING; // overlong 6 byte character encoding
}
*value = u;
return 6;
}
*error_status |= INVALID_CONTINUATION_VALUE; // invalid UTF=8 start value
return 0;
#undef INPUT_BUFFER_TOO_SHORT
#undef INVALID_CONTINUATION_VALUE
#undef OVERLONG_ENCODING
}
// Decodes the UTF-8 sequence at the start of sUTF8[] into one unicode
// code point.
//
// Parameters:
//   sUTF8              - input buffer (must not be null).
//   sUTF8_count        - number of elements available in sUTF8[] (must be > 0).
//   e                  - error handling settings and accumulated status.
//                        When null, a local copy of
//                        ON_UnicodeErrorParameters::MaskErrors is used and
//                        the status bits are not visible to the caller.
//   unicode_code_point - receives the decoded code point (must not be null).
// Returns:
//   Number of elements of sUTF8[] that were consumed, or 0 when nothing
//   could be decoded. When 0 is returned *unicode_code_point is not set.
//
// Bits OR-ed into e->m_error_status by this function:
//    1: invalid input parameters
//    4: a UTF-16 surrogate pair that had been encoded as two consecutive
//       UTF-8 sequences was combined into one code point
//    8: overlong encoding (reported by ON_DecodeUTF8Helper)
//   16: illegal sequence (reported by ON_DecodeUTF8Helper)
//
// Changes relative to the previous implementation:
//   * The 3 byte fast path tested "u0 <= 0xD800" and therefore returned the
//     surrogate value U+D800 as if it were a valid code point. It now uses
//     "u0 < 0xD800", matching the validity test used after the helper call.
//   * When the helper could not parse anything, the error was masked, and
//     e->m_error_code_point was not a valid code point, 1 was returned
//     without setting *unicode_code_point. That case now returns 0, like
//     the final fall-through of this function.
//   * Null tests of e that could never be true (e is replaced by &local_e
//     when null) were removed.
int ON_DecodeUTF8(
  const char* sUTF8,
  int sUTF8_count,
  struct ON_UnicodeErrorParameters* e,
  ON__UINT32* unicode_code_point
  )
{
  ON__UINT32 u0, u1;
  int i0, i1;
  unsigned int error_status;
  ON__UINT16 sUTF16[2];
  char c;

  ON_UnicodeErrorParameters local_e = ON_UnicodeErrorParameters::MaskErrors;
  if (nullptr == e)
    e = &local_e;

  if ( 0 == sUTF8 || sUTF8_count <= 0 || 0 == unicode_code_point )
  {
    e->m_error_status |= 1;
    return 0;
  }

  // special cases for most common unicode values
  // If any error conditions exist, then ON_DecodeUTF8Helper()
  // is used.
  if ( 0 == (0x80 & sUTF8[0]) )
  {
    *unicode_code_point = sUTF8[0];
    return 1;
  }

  c = sUTF8[0];
  if ( 0xC0 == ( 0xE0 & c) && sUTF8_count >= 2 )
  {
    // 2 byte character encoding: 110xxxxx, 10xxxxxx
    u0 = (0x1F & c);
    c = sUTF8[1];
    if ( 0x80 == ( 0xC0 & c) )
    {
      u0 *= 64;
      u0 |= (0x3F & c);
      if ( u0 > 0x7F )
      {
        *unicode_code_point = u0;
        return 2;
      }
    }
  }
  else if ( 0xE0 == ( 0xF0 & c) && sUTF8_count >= 3 )
  {
    // 3 byte character encoding: 1110xxxx, 10xxxxxx, 10xxxxxx
    u0 = (0x0F & c);
    c = sUTF8[1];
    if ( 0x80 == ( 0xC0 & c) )
    {
      u0 *= 64;
      u0 |= (0x3F & c);
      c = sUTF8[2];
      if ( 0x80 == ( 0xC0 & c) )
      {
        u0 *= 64;
        u0 |= (0x3F & c);
        // Not overlong (>= 0x0800) and not a surrogate (0xD800 ... 0xDFFF).
        if ( u0 >= 0x0800 && (u0 < 0xD800 || u0 >= 0xE000) )
        {
          *unicode_code_point = u0;
          return 3;
        }
      }
    }
  }
  else if ( 0xF0 == ( 0xF8 & c) && sUTF8_count >= 4 )
  {
    // 4 byte character encoding: 11110xxx, 10xxxxxx, 10xxxxxx, 10xxxxxx
    u0 = (0x07 & c);
    c = sUTF8[1];
    if ( 0x80 == ( 0xC0 & c) )
    {
      u0 *= 64;
      u0 |= (0x3F & c);
      c = sUTF8[2];
      if ( 0x80 == ( 0xC0 & c) )
      {
        u0 *= 64;
        u0 |= (0x3F & c);
        c = sUTF8[3];
        if ( 0x80 == ( 0xC0 & c) )
        {
          u0 *= 64;
          u0 |= (0x3F & c);
          if ( u0 >= 0x010000 && u0 <= 0x10FFFF )
          {
            *unicode_code_point = u0;
            return 4;
          }
        }
      }
    }
  }

  // The fast paths did not produce a valid code point.
  // Use the general decoder so that error bits are collected.
  error_status = 0;
  u0 = 0xFFFFFFFF;
  i0 = ON_DecodeUTF8Helper(sUTF8,sUTF8_count,&u0,&error_status);
  if ( i0 > 0 && 0 == error_status && (u0 < 0xD800 || (u0 >= 0xE000 && u0 <= 0x10FFFF) ) )
  {
    // valid UTF-8 multibyte encoding parsed
    *unicode_code_point = u0;
    return i0;
  }

  // report error condition
  // NOTE(review): when the helper parsed a sequence without error but the
  // value is a surrogate or exceeds 0x10FFFF, error_status is 0 here, so no
  // status bit is recorded and the mask test below passes. Confirm whether
  // bit 16 should be reported in that case.
  e->m_error_status |= error_status;

  if ( error_status != (error_status & e->m_error_mask) )
  {
    // this error is not masked
    return 0;
  }

  if ( i0 <= 0 )
  {
    // The helper could not parse a sequence at sUTF8[0].
    if ( !ON_IsValidUnicodeCodePoint(e->m_error_code_point) )
    {
      // There is no replacement value to report, so nothing was decoded.
      return 0;
    }

    // skip to next UTF-8 start element
    for ( i0 = 1; i0 < sUTF8_count; i0++ )
    {
      // Search for the next element of sUTF8[] that is the
      // start of a UTF-8 encoding sequence.
      c = sUTF8[i0];
      if (    0 == (0x80 & c)     // ASCII 0 - 127
           || 0xC0 == ( 0xE0 & c) // 2 byte encoding first character
           || 0xE0 == ( 0xF0 & c) // 3 byte encoding first character
           || 0xF0 == ( 0xF8 & c) // 4 byte encoding first character
           || 0xF8 == ( 0xFC & c) // 5 byte encoding first character
           || 0xFC == ( 0xFE & c) // 6 byte encoding first character
         )
      {
        // resume parsing at this character
        break;
      }
    }
    *unicode_code_point = e->m_error_code_point;
    return i0;
  }

  if ( ON_IsValidUnicodeCodePoint(u0) && 8 == error_status )
  {
    // overlong UTF-8 multibyte encoding of valid unicode code point
    *unicode_code_point = u0;
    return i0;
  }

  if ( i0 < sUTF8_count
       && u0 >= 0xD800 && u0 <= 0xDBFF
       && (0 == error_status || 8 == error_status)
       && 0 != (4 & e->m_error_mask)
     )
  {
    // See if a UTF-16 surrogate pair was incorrectly encoded
    // as two consecutive UTF-8 sequences.
    u1 = 0xFFFFFFFF;
    i1 = ON_DecodeUTF8Helper(sUTF8+i0,sUTF8_count-i0,&u1,&error_status);
    if ( i1 > 0 && (0 == error_status || 8 == error_status) )
    {
      error_status = 0;
      sUTF16[0] = (ON__UINT16)u0;
      sUTF16[1] = (ON__UINT16)u1;
      u0 = 0xFFFFFFFF;
      if ( 2 == ON_ConvertUTF16ToUTF32(false,sUTF16,2,&u0,1,&error_status,0,0,0)
           && 0 == error_status
           && ON_IsValidUnicodeCodePoint(u0)
         )
      {
        *unicode_code_point = u0;
        e->m_error_status |= 4;
        return i0+i1;
      }
    }
  }

  if ( ON_IsValidUnicodeCodePoint(e->m_error_code_point) )
  {
    // replace the undecodable sequence with the caller's error code point
    *unicode_code_point = e->m_error_code_point;
    return i0;
  }

  return 0;
}
// Encodes unicode_code_point as UTF-16.
//
// Parameters:
//   unicode_code_point - value to encode.
//   sUTF16             - output; sUTF16[0] is written for a single element
//                        encoding, sUTF16[0] and sUTF16[1] for a surrogate
//                        pair. Nothing is written when 0 is returned.
// Returns:
//   1: U+0000 ... U+D7FF or U+E000 ... U+FFFF (stored as is)
//   2: U+10000 ... U+10FFFF (stored as high, low surrogate)
//   0: 0xD800 ... 0xDFFF or anything above 0x10FFFF (not a code point)
int ON_EncodeUTF16( ON__UINT32 unicode_code_point, ON__UINT16 sUTF16[2] )
{
  const bool bIsSurrogateValue = ( unicode_code_point >= 0xD800 && unicode_code_point < 0xE000 );
  if ( bIsSurrogateValue || unicode_code_point > 0x10FFFF )
  {
    // invalid unicode code point value
    return 0;
  }

  if ( unicode_code_point <= 0xFFFF )
  {
    // the most common case: the code point is its own UTF-16 value
    sUTF16[0] = (ON__UINT16)unicode_code_point;
    return 1;
  }

  // U+10000 ... U+10FFFF: split the 20 bit offset into two 10 bit halves
  const ON__UINT32 offset = unicode_code_point - 0x10000;
  sUTF16[0] = (ON__UINT16)(0xD800 + (offset >> 10));   // high surrogate value (0xD800 ... 0xDBFF)
  sUTF16[1] = (ON__UINT16)(0xDC00 + (offset & 0x3FF)); // low surrogate value (0xDC00 ... 0xDFFF)
  return 2;
}
int ON_DecodeUTF16(
const ON__UINT16* sUTF16,
int sUTF16_count,
struct ON_UnicodeErrorParameters* e,
ON__UINT32* unicode_code_point
)
{
ON__UINT32 uhi, ulo;
ON_UnicodeErrorParameters local_e = ON_UnicodeErrorParameters::MaskErrors;
if (nullptr == e)
e = &local_e;
if ( 0 == sUTF16 || sUTF16_count <= 0 || 0 == unicode_code_point )
{
if ( e )
e->m_error_status |= 1;
return 0;
}
// special case for most common UTF-16 single element values
if ( ( sUTF16[0] < 0xD800 ) || ( sUTF16[0] >= 0xE000 ) )
{
*unicode_code_point = sUTF16[0];
return 1;
}
if ( sUTF16_count >= 2 && sUTF16[0] < 0xDC00 && sUTF16[1] >= 0xDC00 && sUTF16[1] < 0xE000 )
{
// UTF-16 surrogate pair
uhi = sUTF16[0];
ulo = sUTF16[1];
*unicode_code_point = (uhi-0xD800)*0x400 + (ulo-0xDC00) + 0x10000;
return 2;
}
// handle errors
if ( 0 == e )
{
// no errors are masked.
return 0;
}
// report error condition
e->m_error_status |= 16;
if ( 16 != (16 & e->m_error_mask) || !ON_IsValidUnicodeCodePoint(e->m_error_code_point) )
{
// this error is not masked
return 0;
}
// Search for the next element of sUTF16[] that is a
// valid UTF-16 encoding sequence.
int i;
for ( i = 1; i < sUTF16_count; i++ )
{
if ( ( sUTF16[i] < 0xD800 ) || ( sUTF16[i] >= 0xE000 ) )
{
// valid single UTF-16 code unit
break;
}
if ( i+1 < sUTF16_count
&& sUTF16[i] >= 0xD800 && sUTF16[i] < 0xDC00
&& sUTF16[i+1] >= 0xDC00 && sUTF16[i+1] < 0xE000
)
{
// valid UTF-16 surrogate pair
break;
}
}
*unicode_code_point = e->m_error_code_point;
return i;
}
// Decodes the leading UTF-16 sequence of a little endian encoded buffer.
// When ON_LITTLE_ENDIAN is defined the elements are already in the byte
// order ON_DecodeUTF16() reads, so the call is forwarded unchanged;
// otherwise ON_DecodeSwapByteUTF16() is used to swap the bytes of each
// element while decoding.
// All arguments are passed through and the callee's return value is
// returned as is.
int ON_DecodeUTF16LE(
const ON__UINT16* sUTF16,
int sUTF16_count,
struct ON_UnicodeErrorParameters* e,
ON__UINT32* unicode_code_point
)
{
#if defined(ON_LITTLE_ENDIAN)
return ON_DecodeUTF16(sUTF16,sUTF16_count,e,unicode_code_point);
#else
return ON_DecodeSwapByteUTF16(sUTF16,sUTF16_count,e,unicode_code_point);
#endif
}
// Decodes the leading UTF-16 sequence of a big endian encoded buffer.
// When ON_BIG_ENDIAN is defined the call is forwarded unchanged to
// ON_DecodeUTF16(); otherwise ON_DecodeSwapByteUTF16() is used to swap
// the bytes of each element while decoding.
// All arguments are passed through and the callee's return value is
// returned as is.
int ON_DecodeUTF16BE(
const ON__UINT16* sUTF16,
int sUTF16_count,
struct ON_UnicodeErrorParameters* e,
ON__UINT32* unicode_code_point
)
{
#if defined(ON_BIG_ENDIAN)
return ON_DecodeUTF16(sUTF16,sUTF16_count,e,unicode_code_point);
#else
return ON_DecodeSwapByteUTF16(sUTF16,sUTF16_count,e,unicode_code_point);
#endif
}
// Decodes the leading element of a little endian encoded UTF-32 buffer.
// When ON_LITTLE_ENDIAN is defined the call is forwarded unchanged to
// ON_DecodeUTF32(); otherwise it is forwarded to ON_DecodeSwapByteUTF32().
// All arguments are passed through and the callee's return value is
// returned as is.
int ON_DecodeUTF32LE(
const ON__UINT32* sUTF32,
int sUTF32_count,
struct ON_UnicodeErrorParameters* e,
ON__UINT32* unicode_code_point
)
{
#if defined(ON_LITTLE_ENDIAN)
return ON_DecodeUTF32(sUTF32,sUTF32_count,e,unicode_code_point);
#else
return ON_DecodeSwapByteUTF32(sUTF32,sUTF32_count,e,unicode_code_point);
#endif
}
// Decodes the leading element of a big endian encoded UTF-32 buffer.
// When ON_BIG_ENDIAN is defined the call is forwarded unchanged to
// ON_DecodeUTF32(); otherwise it is forwarded to ON_DecodeSwapByteUTF32().
// All arguments are passed through and the callee's return value is
// returned as is.
int ON_DecodeUTF32BE(
const ON__UINT32* sUTF32,
int sUTF32_count,
struct ON_UnicodeErrorParameters* e,
ON__UINT32* unicode_code_point
)
{
#if defined(ON_BIG_ENDIAN)
return ON_DecodeUTF32(sUTF32,sUTF32_count,e,unicode_code_point);
#else
return ON_DecodeSwapByteUTF32(sUTF32,sUTF32_count,e,unicode_code_point);
#endif
}
// Encodes code_point into the wchar_t buffer w[] using the encoding that
// matches ON_SIZEOF_WCHAR_T (1: UTF-8, 2: UTF-16, 4: UTF-32).
//
// Parameters:
//   code_point - value to encode; it must pass ON_IsValidUnicodeCodePoint().
//   w_capacity - number of wchar_t elements available in w[].
//   w          - output buffer.
// Returns:
//   Number of wchar_t elements written for the code point, or 0 when
//   w is null, w_capacity is 0, code_point is invalid, or the encoding
//   does not fit in w_capacity elements.
// Remarks:
//   When w is not null and w_capacity > 0, a 0 terminator is stored at
//   w[return value] whenever that index is inside the buffer.
//
//   Capacity comparisons are done in size_t. The previous code cast
//   w_capacity to int, so a capacity larger than the maximum int could
//   compare as negative, reject every encoding and skip the terminator.
int ON_EncodeWideChar(
  ON__UINT32 code_point,
  size_t w_capacity,
  wchar_t* w
  )
{
  int rc = 0;
  if (nullptr != w && w_capacity > 0)
  {
    if (ON_IsValidUnicodeCodePoint(code_point))
    {
#if 1 == ON_SIZEOF_WCHAR_T
      char sUTF8[6];
      rc = ON_EncodeUTF8(code_point, sUTF8);
      // A valid code point needs 1 to 4 UTF-8 bytes; anything else,
      // or an encoding that does not fit, is reported as failure.
      if (rc < 1 || rc > 4 || static_cast<size_t>(rc) > w_capacity)
        rc = 0;
      for (int i = 0; i < rc; i++)
        w[i] = (wchar_t)sUTF8[i];
#elif 2 == ON_SIZEOF_WCHAR_T
      ON__UINT16 sUTF16[2];
      rc = ON_EncodeUTF16(code_point, sUTF16);
      // 1 element or a surrogate pair; fail when it does not fit.
      if (rc < 1 || rc > 2 || static_cast<size_t>(rc) > w_capacity)
        rc = 0;
      for (int i = 0; i < rc; i++)
        w[i] = (wchar_t)sUTF16[i];
#elif 4 == ON_SIZEOF_WCHAR_T
      // w_capacity > 0 was checked above, so one element always fits.
      w[0] = (wchar_t)code_point;
      rc = 1;
#endif
    }
    // null terminate when there is room after the encoded elements
    if (rc >= 0 && static_cast<size_t>(rc) < w_capacity)
      w[rc] = 0;
  }
  return rc;
}
// Decodes the leading code point of a wchar_t buffer by forwarding to the
// decoder that matches ON_WCHAR_T_ENCODING. Which cases are compiled
// depends on ON_SIZEOF_WCHAR_T (1: UTF-8, 2: UTF-16, 4: UTF-32).
//
// Parameters and return value are those of the selected decoder.
// When ON_WCHAR_T_ENCODING is not one of the compiled cases, bit 1 is
// set in e->m_error_status (if e is not null) and 0 is returned.
int ON_DecodeWideChar(
  const wchar_t* sWideChar,
  int sWideChar_count,
  struct ON_UnicodeErrorParameters* e,
  ON__UINT32* unicode_code_point
  )
{
  const ON_UnicodeEncoding widechar_encoding = ON_WCHAR_T_ENCODING;

  switch (widechar_encoding)
  {
#if 1 == ON_SIZEOF_WCHAR_T
  case ON_UTF_8:
    return ON_DecodeUTF8((const char*)sWideChar,sWideChar_count,e,unicode_code_point);
#elif 2 == ON_SIZEOF_WCHAR_T
  case ON_UTF_16:
    return ON_DecodeUTF16((const ON__UINT16*)sWideChar,sWideChar_count,e,unicode_code_point);
  case ON_UTF_16BE:
    return ON_DecodeUTF16BE((const ON__UINT16*)sWideChar,sWideChar_count,e,unicode_code_point);
  case ON_UTF_16LE:
    return ON_DecodeUTF16LE((const ON__UINT16*)sWideChar,sWideChar_count,e,unicode_code_point);
#elif 4 == ON_SIZEOF_WCHAR_T
  case ON_UTF_32:
    return ON_DecodeUTF32((const ON__UINT32*)sWideChar,sWideChar_count,e,unicode_code_point);
  case ON_UTF_32BE:
    return ON_DecodeUTF32BE((const ON__UINT32*)sWideChar,sWideChar_count,e,unicode_code_point);
  case ON_UTF_32LE:
    return ON_DecodeUTF32LE((const ON__UINT32*)sWideChar,sWideChar_count,e,unicode_code_point);
#endif
  default:
    break;
  }

  // no decoder is available for this wchar_t encoding
  if ( nullptr != e )
    e->m_error_status |= 1;
  return 0;
}
// Decodes the leading UTF-16 sequence of sUTF16[] when the two bytes of
// every element are stored in the opposite order to the one this CPU uses.
// Each element is byte swapped into a local variable before it is tested;
// the input buffer is never modified.
//
// Parameters:
//   sUTF16             - input buffer (must not be null).
//   sUTF16_count       - number of elements available (must be > 0).
//   e                  - error settings and accumulated status. When null,
//                        a local copy of ON_UnicodeErrorParameters::MaskErrors
//                        is used.
//   unicode_code_point - receives the decoded value (must not be null).
// Returns:
//   1 for a single element value, 2 for a surrogate pair, 0 on failure
//   (bit 1 of e->m_error_status for invalid parameters, bit 16 for an
//   unpaired surrogate). When bit 16 is set in e->m_error_mask and
//   e->m_error_code_point is valid, the error code point is returned and
//   the return value is the number of elements skipped to reach the next
//   decodable element or the end of the buffer.
int ON_DecodeSwapByteUTF16(
const ON__UINT16* sUTF16,
int sUTF16_count,
struct ON_UnicodeErrorParameters* e,
ON__UINT32* unicode_code_point
)
{
int i;
ON__UINT32 uhi, ulo;
ON__UINT16 w0, w1;
const ON__UINT8* p;
ON__UINT8* p0;
ON__UINT8* p1;
ON_UnicodeErrorParameters local_e = ON_UnicodeErrorParameters::MaskErrors;
// From here on e is never null.
if (nullptr == e)
e = &local_e;
if ( 0 == sUTF16 || sUTF16_count <= 0 || 0 == unicode_code_point )
{
if ( e )
e->m_error_status |= 1;
return 0;
}
// special case for most common UTF-16 single element values
// w0 = byte swapped sUTF16[0]
// p walks the input as raw bytes; p0 gives byte access to w0.
p = (const ON__UINT8*)sUTF16;
p0 = (ON__UINT8*)&w0;
p0[1] = p[0];
p0[0] = p[1];
if ( ( w0 < 0xD800 ) || (w0 >= 0xE000 ) )
{
*unicode_code_point = w0;
return 1;
}
// w0 is a surrogate value; a high surrogate needs a following low surrogate.
if ( sUTF16_count >= 2 && w0 < 0xDC00 )
{
// w1 = byte swapped sUTF16[1]
p1 = (ON__UINT8*)&w1;
p1[1] = p[2];
p1[0] = p[3];
if ( w1 >= 0xDC00 && w1 < 0xE000 )
{
// UTF-16 surrogate pair
uhi = w0;
ulo = w1;
*unicode_code_point = (uhi-0xD800)*0x400 + (ulo-0xDC00) + 0x10000;
return 2;
}
}
// handle errors
// (e was replaced by &local_e when it was null, so this test cannot succeed.)
if ( 0 == e )
{
// no errors are masked.
return 0;
}
// report error condition
e->m_error_status |= 16;
if ( 16 != (16 & e->m_error_mask) || !ON_IsValidUnicodeCodePoint(e->m_error_code_point) )
{
// this error is not masked
return 0;
}
// Search for the next element of sUTF16[] that is a
// valid UTF-16 encoding sequence.
p1 = (ON__UINT8*)&w1;
// p advances one element (two bytes) per iteration, in step with i.
p += sizeof(sUTF16[0]);
for ( i = 1; i < sUTF16_count; i++, p += sizeof(sUTF16[0]) )
{
// w0 = byte swapped sUTF16[i]
p0[1] = p[0];
p0[0] = p[1];
if ( ( w0 < 0xD800 ) || ( w0 >= 0xE000 ) )
{
// valid single UTF-16 code unit
break;
}
if ( i+1 < sUTF16_count && w0 >= 0xD800 && w0 < 0xDC00 )
{
// w1 = byte swapped sUTF16[i+1]
p1[1] = p[sizeof(sUTF16[0])];
p1[0] = p[sizeof(sUTF16[0])+1];
if ( w1 >= 0xDC00 && w1 < 0xE000 )
{
// valid UTF-16 surrogate pair
break;
}
}
}
// i is the number of elements to skip; the caller resumes at sUTF16+i.
*unicode_code_point = e->m_error_code_point;
return i;
}
int ON_ConvertUTF8ToUTF8(
int bTestByteOrder,
const char* sInputUTF8,
int sInputUTF8_count,
char* sOutputUTF8,
int sOutputUTF8_count,
unsigned int* error_status,
unsigned int error_mask,
ON__UINT32 error_code_point,
const char** sNextInputUTF8
)
{
int i, j, k, output_count;
ON__UINT32 u;
char s[6];
struct ON_UnicodeErrorParameters e;
if ( 0 != error_status )
*error_status = 0;
if ( -1 == sInputUTF8_count && 0 != sInputUTF8 )
{
for ( sInputUTF8_count = 0; 0 != sInputUTF8[sInputUTF8_count]; sInputUTF8_count++)
{
// empty for body
}
}
if ( 0 == sInputUTF8 || sInputUTF8_count < 0 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextInputUTF8 )
*sNextInputUTF8 = sInputUTF8;
return 0;
}
if ( 0 == sOutputUTF8_count )
{
sOutputUTF8 = 0;
sOutputUTF8_count = 2147483647; // maximum value of a 32-bit signed int
}
else if ( 0 == sOutputUTF8 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextInputUTF8 )
*sNextInputUTF8 = sInputUTF8;
return 0;
}
if ( bTestByteOrder && ON_IsUTF8ByteOrderMark(sInputUTF8,sInputUTF8_count) )
{
// skip UTF-8 byte order element
sInputUTF8_count -= 3;
sInputUTF8 += 3;
}
e.m_error_status = 0;
e.m_error_mask = error_mask;
e.m_error_code_point = error_code_point;
output_count = 0;
for ( i = 0; i < sInputUTF8_count; i += j )
{
j = ON_DecodeUTF8(sInputUTF8+i,sInputUTF8_count-i,&e,&u);
if ( j <= 0 )
break;
k = ON_EncodeUTF8(u,s);
if ( 0 != sOutputUTF8 )
{
if ( output_count + k > sOutputUTF8_count )
{
e.m_error_status |= 2;
break;
}
memcpy(sOutputUTF8+output_count,s,k*sizeof(sOutputUTF8[0]));
}
output_count += k;
}
if ( 0 != sOutputUTF8 && output_count < sOutputUTF8_count)
sOutputUTF8[output_count] = 0;
if ( sNextInputUTF8 )
*sNextInputUTF8 = sInputUTF8+i;
if ( error_status )
*error_status = e.m_error_status;
return output_count;
}
// Converts a UTF-8 string to UTF-16 (native byte order) by decoding each
// sequence with ON_DecodeUTF8() and encoding it with ON_EncodeUTF16().
//
// Parameters:
//   bTestByteOrder - when nonzero, ON_IsUTF8ByteOrderMark() is asked
//                    whether the input starts with a byte order mark and,
//                    if so, the mark is skipped.
//   sUTF8          - input string.
//   sUTF8_count    - number of input elements, or -1 for a null
//                    terminated string.
//   sUTF16         - output buffer.
//   sUTF16_count   - output capacity. When 0, nothing is written and the
//                    return value is the number of elements the output
//                    would need (terminator not included).
//   error_status   - optional; receives the accumulated error bits
//                    (1: invalid parameters, 2: output buffer too small,
//                    16: a decoded value has no UTF-16 encoding, plus the
//                    bits set by ON_DecodeUTF8()).
//   error_mask, error_code_point - forwarded to ON_DecodeUTF8(); also used
//                    here when a decoded value cannot be encoded.
//   sNextUTF8      - optional; receives a pointer to the first input
//                    element that was not converted.
// Returns:
//   Number of UTF-16 elements produced (or required when counting).
//   The output is null terminated when there is room for the terminator.
//
// Fix: ON_EncodeUTF16() returns 0 and leaves its output untouched for
// values it cannot encode. The previous loop ignored that and stored the
// uninitialized w[0] at sUTF16[output_count], which is one element past
// the end of the buffer when the buffer is exactly full. Such values are
// now treated like any other bit 16 error: if the error is masked the
// error code point is substituted, otherwise the conversion stops.
int ON_ConvertUTF8ToUTF16(
  int bTestByteOrder,
  const char* sUTF8,
  int sUTF8_count,
  ON__UINT16* sUTF16,
  int sUTF16_count,
  unsigned int* error_status,
  unsigned int error_mask,
  ON__UINT32 error_code_point,
  const char** sNextUTF8
  )
{
  int i, j, k, output_count;
  ON__UINT32 u;
  ON__UINT16 w[2];
  struct ON_UnicodeErrorParameters e;

  if ( 0 != error_status )
    *error_status = 0;

  if ( -1 == sUTF8_count && 0 != sUTF8 )
  {
    // null terminated input: measure it
    for ( sUTF8_count = 0; 0 != sUTF8[sUTF8_count]; sUTF8_count++)
    {
      // empty for body
    }
  }

  if ( 0 == sUTF8 || sUTF8_count < 0 )
  {
    if ( 0 != error_status )
      *error_status |= 1;
    if ( sNextUTF8 )
      *sNextUTF8 = sUTF8;
    return 0;
  }

  if ( bTestByteOrder && ON_IsUTF8ByteOrderMark(sUTF8,sUTF8_count) )
  {
    // skip UTF-8 byte order element
    sUTF8_count -= 3;
    sUTF8 += 3;
  }

  if ( 0 == sUTF16_count )
  {
    // counting mode: no output is written
    sUTF16 = 0;
    sUTF16_count = 2147483647; // maximum value of a 32-bit signed int
  }
  else if ( 0 == sUTF16 )
  {
    if ( 0 != error_status )
      *error_status |= 1;
    if ( sNextUTF8 )
      *sNextUTF8 = sUTF8;
    return 0;
  }

  e.m_error_status = 0;
  e.m_error_mask = error_mask;
  e.m_error_code_point = error_code_point;

  output_count = 0;

  for ( i = 0; i < sUTF8_count; i += j )
  {
    j = ON_DecodeUTF8(sUTF8+i,sUTF8_count-i,&e,&u);
    if ( j <= 0 )
      break;
    k = ON_EncodeUTF16(u,w);
    if ( k <= 0 )
    {
      // u is a surrogate value or exceeds 0x10FFFF and has no UTF-16 encoding.
      e.m_error_status |= 16;
      if ( 0 == (16 & e.m_error_mask) )
        break; // this error is not masked
      k = ON_EncodeUTF16(e.m_error_code_point,w);
      if ( k <= 0 )
        break; // the replacement cannot be encoded either
    }
    if ( 0 != sUTF16 )
    {
      if ( output_count + k > sUTF16_count )
      {
        // output buffer is full
        e.m_error_status |= 2;
        break;
      }
      sUTF16[output_count] = w[0];
      if ( 2 == k )
        sUTF16[output_count+1] = w[1];
    }
    output_count += k;
  }

  if ( 0 != sUTF16 && output_count < sUTF16_count)
    sUTF16[output_count] = 0;
  if ( sNextUTF8 )
    *sNextUTF8 = sUTF8+i;
  if ( error_status )
    *error_status = e.m_error_status;

  return output_count;
}
int ON_ConvertUTF8ToUTF32(
int bTestByteOrder,
const char* sUTF8,
int sUTF8_count,
ON__UINT32* sUTF32,
int sUTF32_count,
unsigned int* error_status,
unsigned int error_mask,
ON__UINT32 error_code_point,
const char** sNextUTF8
)
{
int i, j, output_count;
ON__UINT32 u;
struct ON_UnicodeErrorParameters e;
if ( 0 != error_status )
*error_status = 0;
if ( -1 == sUTF8_count && 0 != sUTF8 )
{
for ( sUTF8_count = 0; 0 != sUTF8[sUTF8_count]; sUTF8_count++)
{
// empty for body
}
}
if ( 0 == sUTF8 || sUTF8_count < 0 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF8 )
*sNextUTF8 = sUTF8;
return 0;
}
if ( bTestByteOrder && ON_IsUTF8ByteOrderMark(sUTF8,sUTF8_count) )
{
// skip UTF-8 byte order element
sUTF8_count -= 3;
sUTF8 += 3;
}
if ( 0 == sUTF32_count )
{
sUTF32 = 0;
sUTF32_count = 2147483647; // maximum value of a 32-bit signed int
}
else if ( 0 == sUTF32 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF8 )
*sNextUTF8 = sUTF8;
return 0;
}
e.m_error_status = 0;
e.m_error_mask = error_mask;
e.m_error_code_point = error_code_point;
output_count = 0;
for ( i = 0; i < sUTF8_count; i += j )
{
j = ON_DecodeUTF8(sUTF8+i,sUTF8_count-i,&e,&u);
if ( j <= 0 )
break;
if ( 0 != sUTF32 )
{
if ( output_count >= sUTF32_count )
{
e.m_error_status |= 2;
break;
}
sUTF32[output_count] = u;
}
output_count++;
}
if ( 0 != sUTF32 && output_count < sUTF32_count)
sUTF32[output_count] = 0;
if ( sNextUTF8 )
*sNextUTF8 = sUTF8+i;
if ( error_status )
*error_status = e.m_error_status;
return output_count;
}
int ON_ConvertUTF16ToUTF8(
int bTestByteOrder,
const ON__UINT16* sUTF16,
int sUTF16_count,
char* sUTF8,
int sUTF8_count,
unsigned int* error_status,
unsigned int error_mask,
ON__UINT32 error_code_point,
const ON__UINT16** sNextUTF16
)
{
int i, j, k, output_count, bSwapBytes;
ON__UINT32 u;
char s[6];
struct ON_UnicodeErrorParameters e;
if ( 0 != error_status )
*error_status = 0;
if ( -1 == sUTF16_count && 0 != sUTF16 )
{
for ( sUTF16_count = 0; 0 != sUTF16[sUTF16_count]; sUTF16_count++)
{
// empty for body
}
}
if ( 0 == sUTF16 || sUTF16_count < 0 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF16 )
*sNextUTF16 = sUTF16;
return 0;
}
if ( 0 == sUTF8_count )
{
sUTF8 = 0;
sUTF8_count = 2147483647; // maximum value of a 32-bit signed int
}
else if ( 0 == sUTF8 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF16 )
*sNextUTF16 = sUTF16;
return 0;
}
bSwapBytes = false;
if ( bTestByteOrder && sUTF16_count > 0 )
{
if ( 0xFEFF == sUTF16[0] )
{
// skip BOM
sUTF16_count--;
sUTF16++;
}
else if ( 0xFFFE == sUTF16[0] )
{
// skip BOM and swap bytes in rest of sUTF16
bSwapBytes = true;
sUTF16_count--;
sUTF16++;
}
}
e.m_error_status = 0;
e.m_error_mask = error_mask;
e.m_error_code_point = error_code_point;
output_count = 0;
if ( bSwapBytes )
{
for ( i = 0; i < sUTF16_count; i += j )
{
j = ON_DecodeSwapByteUTF16(sUTF16+i,sUTF16_count-i,&e,&u);
if ( j <= 0 )
break;
k = ON_EncodeUTF8(u,s);
if ( 0 != sUTF8 )
{
if ( output_count + k > sUTF8_count )
{
e.m_error_status |= 2;
break;
}
memcpy(sUTF8+output_count,s,k*sizeof(sUTF8[0]));
}
output_count += k;
}
}
else
{
for ( i = 0; i < sUTF16_count; i += j )
{
j = ON_DecodeUTF16(sUTF16+i,sUTF16_count-i,&e,&u);
if ( j <= 0 )
break;
k = ON_EncodeUTF8(u,s);
if ( 0 != sUTF8 )
{
if ( output_count + k > sUTF8_count )
{
e.m_error_status |= 2;
break;
}
memcpy(sUTF8+output_count,s,k*sizeof(sUTF8[0]));
}
output_count += k;
}
}
if ( 0 != sUTF8 && output_count < sUTF8_count)
sUTF8[output_count] = 0;
if ( sNextUTF16 )
*sNextUTF16 = sUTF16+i;
if ( error_status )
*error_status = e.m_error_status;
return output_count;
}
int ON_ConvertUTF16ToUTF16(
int bTestByteOrder,
const ON__UINT16* sInputUTF16,
int sInputUTF16_count,
ON__UINT16* sOutputUTF16,
int sOutputUTF16_count,
unsigned int* error_status,
unsigned int error_mask,
ON__UINT32 error_code_point,
const ON__UINT16** sNextInputUTF16
)
{
int i, j, k, output_count, bSwapBytes;
ON__UINT32 u;
ON__UINT16 s[2];
struct ON_UnicodeErrorParameters e;
if ( 0 != error_status )
*error_status = 0;
if ( -1 == sInputUTF16_count && 0 != sInputUTF16 )
{
for ( sInputUTF16_count = 0; 0 != sInputUTF16[sInputUTF16_count]; sInputUTF16_count++)
{
// empty for body
}
}
if ( 0 == sInputUTF16 || sInputUTF16_count < 0 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextInputUTF16 )
*sNextInputUTF16 = sInputUTF16;
return 0;
}
if ( 0 == sOutputUTF16_count )
{
sOutputUTF16 = 0;
sOutputUTF16_count = 2147483647; // maximum value of a 32-bit signed int
}
else if ( 0 == sOutputUTF16 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextInputUTF16 )
*sNextInputUTF16 = sInputUTF16;
return 0;
}
bSwapBytes = false;
if ( bTestByteOrder && sInputUTF16_count > 0 )
{
if ( 0xFEFF == sInputUTF16[0] )
{
// skip BOM
sInputUTF16_count--;
sInputUTF16++;
}
else if ( 0xFFFE == sInputUTF16[0] )
{
// skip BOM and swap bytes in rest of sInputUTF16
bSwapBytes = true;
sInputUTF16_count--;
sInputUTF16++;
}
}
e.m_error_status = 0;
e.m_error_mask = error_mask;
e.m_error_code_point = error_code_point;
output_count = 0;
if ( bSwapBytes )
{
for ( i = 0; i < sInputUTF16_count; i += j )
{
j = ON_DecodeSwapByteUTF16(sInputUTF16+i,sInputUTF16_count-i,&e,&u);
if ( j <= 0 )
break;
k = ON_EncodeUTF16(u,s);
if ( 0 != sOutputUTF16 )
{
if ( output_count + k > sOutputUTF16_count )
{
e.m_error_status |= 2;
break;
}
memcpy(sOutputUTF16+output_count,s,k*sizeof(sOutputUTF16[0]));
}
output_count += k;
}
}
else
{
for ( i = 0; i < sInputUTF16_count; i += j )
{
j = ON_DecodeUTF16(sInputUTF16+i,sInputUTF16_count-i,&e,&u);
if ( j <= 0 )
break;
k = ON_EncodeUTF16(u,s);
if ( 0 != sOutputUTF16 )
{
if ( output_count + k > sOutputUTF16_count )
{
e.m_error_status |= 2;
break;
}
memcpy(sOutputUTF16+output_count,s,k*sizeof(sOutputUTF16[0]));
}
output_count += k;
}
}
if ( 0 != sOutputUTF16 && output_count < sOutputUTF16_count)
sOutputUTF16[output_count] = 0;
if ( sNextInputUTF16 )
*sNextInputUTF16 = sInputUTF16+i;
if ( error_status )
*error_status = e.m_error_status;
return output_count;
}
int ON_ConvertUTF16ToUTF32(
int bTestByteOrder,
const ON__UINT16* sUTF16,
int sUTF16_count,
unsigned int* sUTF32,
int sUTF32_count,
unsigned int* error_status,
unsigned int error_mask,
ON__UINT32 error_code_point,
const ON__UINT16** sNextUTF16
)
{
int i, j, output_count, bSwapBytes;
ON__UINT32 u;
struct ON_UnicodeErrorParameters e;
if ( 0 != error_status )
*error_status = 0;
if ( -1 == sUTF16_count && 0 != sUTF16 )
{
for ( sUTF16_count = 0; 0 != sUTF16[sUTF16_count]; sUTF16_count++)
{
// empty for body
}
}
if ( 0 == sUTF16 || sUTF16_count < 0 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF16 )
*sNextUTF16 = sUTF16;
return 0;
}
if ( 0 == sUTF32_count )
{
sUTF32 = 0;
sUTF32_count = 2147483647; // maximum value of a 32-bit signed int
}
else if ( 0 == sUTF32 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF16 )
*sNextUTF16 = sUTF16;
return 0;
}
bSwapBytes = false;
if ( bTestByteOrder && sUTF16_count > 0 )
{
if ( 0xFEFF == sUTF16[0] )
{
// skip BOM
sUTF16_count--;
sUTF16++;
}
else if ( 0xFFFE == sUTF16[0] )
{
// skip BOM and swap bytes in rest of sUTF16
bSwapBytes = true;
sUTF16_count--;
sUTF16++;
}
}
e.m_error_status = 0;
e.m_error_mask = error_mask;
e.m_error_code_point = error_code_point;
output_count = 0;
if ( bSwapBytes )
{
for ( i = 0; i < sUTF16_count; i += j )
{
j = ON_DecodeSwapByteUTF16(sUTF16+i,sUTF16_count-i,&e,&u);
if ( j <= 0 )
break;
if ( 0 != sUTF32 )
{
if ( output_count >= sUTF32_count )
{
e.m_error_status |= 2;
break;
}
sUTF32[output_count] = u;
}
output_count++;
}
}
else
{
for ( i = 0; i < sUTF16_count; i += j )
{
j = ON_DecodeUTF16(sUTF16+i,sUTF16_count-i,&e,&u);
if ( j <= 0 )
break;
if ( 0 != sUTF32 )
{
if ( output_count >= sUTF32_count )
{
e.m_error_status |= 2;
break;
}
sUTF32[output_count] = u;
}
output_count++;
}
}
if ( 0 != sUTF32 && output_count < sUTF32_count)
sUTF32[output_count] = 0;
if ( sNextUTF16 )
*sNextUTF16 = sUTF16+i;
if ( error_status )
*error_status = e.m_error_status;
return output_count;
}
// Returns u with the order of its four bytes reversed
// (byte 0 <-> byte 3 and byte 1 <-> byte 2).
static ON__UINT32 SwapBytes32(ON__UINT32 u)
{
  return ((u & 0x000000FFu) << 24)
       | ((u & 0x0000FF00u) << 8)
       | ((u & 0x00FF0000u) >> 8)
       | ((u & 0xFF000000u) >> 24);
}
int ON_DecodeUTF32(
const ON__UINT32* sUTF32,
int sUTF32_count,
struct ON_UnicodeErrorParameters* e,
ON__UINT32* unicode_code_point
)
{
ON__UINT32 uhi, ulo;
ON_UnicodeErrorParameters local_e = ON_UnicodeErrorParameters::MaskErrors;
if (nullptr == e)
e = &local_e;
if ( 0 == sUTF32 || sUTF32_count <= 0 || 0 == unicode_code_point )
{
e->m_error_status |= 1;
return 0;
}
// special case for most common UTF-16 single element values
if ( ( sUTF32[0] < 0xD800 ) || ( sUTF32[0] >= 0xE000 && sUTF32[0] <= 0x10FFFF) )
{
// valid UTF-32 encoding.
*unicode_code_point = sUTF32[0];
return 1;
}
// handle errors
if ( 0 == e )
return 0;
if ( sUTF32_count >= 2 && sUTF32[0] < 0xDC00 && sUTF32[1] >= 0xDC00 && sUTF32[1] < 0xE000 )
{
// UTF-16 surrogate pair appears in UTF-32 array
e->m_error_status |= 4;
if ( 0 == (4 & e->m_error_mask) )
return 0; // this error is not masked.
uhi = sUTF32[0];
ulo = sUTF32[1];
*unicode_code_point = (uhi-0xD800)*0x400 + (ulo-0xDC00) + 0x10000;
return 2; // error masked and reasonable value returned.
}
// bogus value
e->m_error_status |= 16;
if ( 16 != (16 & e->m_error_mask) || !ON_IsValidUnicodeCodePoint(e->m_error_code_point) )
{
// this error is not masked
return 0;
}
*unicode_code_point = e->m_error_code_point;
return 1; // error masked and e->m_error_code_point returnred.
}
int ON_DecodeSwapByteUTF32(
const ON__UINT32* sUTF32,
int sUTF32_count,
struct ON_UnicodeErrorParameters* e,
ON__UINT32* unicode_code_point
)
{
ON__UINT32 sUTF32swap[2];
ON_UnicodeErrorParameters local_e = ON_UnicodeErrorParameters::MaskErrors;
if (nullptr == e)
e = &local_e;
if ( 0 != sUTF32 && sUTF32_count > 0 )
{
sUTF32swap[0] = SwapBytes32(sUTF32[0]);
if ( sUTF32_count > 1 )
{
// Get up to 2 elements to pass to the unswapped
// decoder so that masked errors are uniformly
// handled.
sUTF32swap[1] = SwapBytes32(sUTF32[1]);
sUTF32_count = 2;
}
sUTF32 = sUTF32swap;
}
return ON_DecodeUTF32(sUTF32,sUTF32_count,e,unicode_code_point);
}
int ON_ConvertUTF32ToUTF8(
int bTestByteOrder,
const ON__UINT32* sUTF32,
int sUTF32_count,
char* sUTF8,
int sUTF8_count,
unsigned int* error_status,
unsigned int error_mask,
ON__UINT32 error_code_point,
const ON__UINT32** sNextUTF32
)
{
int i, k, output_count, bSwapBytes;
ON__UINT32 u;
char s[6];
struct ON_UnicodeErrorParameters e;
if ( 0 != error_status )
*error_status = 0;
if ( -1 == sUTF32_count && 0 != sUTF32 )
{
for ( sUTF32_count = 0; 0 != sUTF32[sUTF32_count]; sUTF32_count++)
{
// empty for body
}
}
if ( 0 == sUTF32 || sUTF32_count < 0 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF32 )
*sNextUTF32 = sUTF32;
return 0;
}
if ( 0 == sUTF8_count )
{
sUTF8 = 0;
sUTF8_count = 2147483647; // maximum value of a 32-bit signed int
}
else if ( 0 == sUTF8 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF32 )
*sNextUTF32 = sUTF32;
return 0;
}
bSwapBytes = false;
if ( bTestByteOrder && sUTF32_count > 0 )
{
if ( 0x0000FEFF == sUTF32[0] )
{
// skip BOM
sUTF32_count--;
sUTF32++;
}
else if ( 0xFFFE0000 == sUTF32[0] )
{
// skip BOM and swap bytes in rest of sUTF32
bSwapBytes = true;
sUTF32_count--;
sUTF32++;
}
}
e.m_error_status = 0;
e.m_error_mask = error_mask;
e.m_error_code_point = error_code_point;
output_count = 0;
for ( i = 0; i < sUTF32_count; i++ )
{
u = bSwapBytes ? SwapBytes32(sUTF32[i]) : sUTF32[i];
if ( !ON_IsValidUnicodeCodePoint(u) )
{
e.m_error_status |= 16;
if ( 16 != (16 & e.m_error_mask) )
break;
if ( !ON_IsValidUnicodeCodePoint(e.m_error_code_point) )
break;
u = e.m_error_code_point;
}
k = ON_EncodeUTF8(u,s);
if ( 0 != sUTF8 )
{
if ( output_count + k > sUTF8_count )
{
e.m_error_status |= 2;
break;
}
memcpy(sUTF8+output_count,s,k*sizeof(sUTF8[0]));
}
output_count += k;
}
if ( 0 != sUTF8 && output_count < sUTF8_count)
sUTF8[output_count] = 0;
if ( sNextUTF32 )
*sNextUTF32 = sUTF32+i;
if ( error_status )
*error_status = e.m_error_status;
return output_count;
}
int ON_ConvertUTF32ToUTF16(
int bTestByteOrder,
const ON__UINT32* sUTF32,
int sUTF32_count,
ON__UINT16* sUTF16,
int sUTF16_count,
unsigned int* error_status,
unsigned int error_mask,
ON__UINT32 error_code_point,
const ON__UINT32** sNextUTF32
)
{
int i, k, output_count, bSwapBytes;
ON__UINT32 u;
ON__UINT16 w[2];
struct ON_UnicodeErrorParameters e;
if ( 0 != error_status )
*error_status = 0;
if ( -1 == sUTF32_count && 0 != sUTF32 )
{
for ( sUTF32_count = 0; 0 != sUTF32[sUTF32_count]; sUTF32_count++)
{
// empty for body
}
}
if ( 0 == sUTF32 || sUTF32_count < 0 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF32 )
*sNextUTF32 = sUTF32;
return 0;
}
if ( 0 == sUTF16_count )
{
sUTF16 = 0;
sUTF16_count = 2147483647; // maximum value of a 32-bit signed int
}
else if ( 0 == sUTF16 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF32 )
*sNextUTF32 = sUTF32;
return 0;
}
bSwapBytes = false;
if ( bTestByteOrder && sUTF32_count > 0 )
{
if ( 0x0000FEFF == sUTF32[0] )
{
// skip BOM
sUTF32_count--;
sUTF32++;
}
else if ( 0xFFFE0000 == sUTF32[0] )
{
// skip BOM and swap bytes in rest of sUTF32
bSwapBytes = true;
sUTF32_count--;
sUTF32++;
}
}
e.m_error_status = 0;
e.m_error_mask = error_mask;
e.m_error_code_point = error_code_point;
output_count = 0;
for ( i = 0; i < sUTF32_count; i++ )
{
u = bSwapBytes ? SwapBytes32(sUTF32[i]) : sUTF32[i];
if ( !ON_IsValidUnicodeCodePoint(u) )
{
e.m_error_status |= 16;
if ( 16 != (16 & e.m_error_mask) )
break;
if ( !ON_IsValidUnicodeCodePoint(e.m_error_code_point) )
break;
u = e.m_error_code_point;
}
k = ON_EncodeUTF16(u,w);
if ( 0 != sUTF16 )
{
if ( output_count + k > sUTF16_count )
{
e.m_error_status |= 2;
break;
}
sUTF16[output_count] = w[0];
if ( 2 == k )
sUTF16[output_count+1] = w[1];
}
output_count += k;
}
if ( 0 != sUTF16 && output_count < sUTF16_count)
sUTF16[output_count] = 0;
if ( sNextUTF32 )
*sNextUTF32 = sUTF32+i;
if ( error_status )
*error_status = e.m_error_status;
return output_count;
}
int ON_ConvertUTF32ToUTF32(
int bTestByteOrder,
const ON__UINT32* sUTF16,
int sUTF16_count,
unsigned int* sUTF32,
int sUTF32_count,
unsigned int* error_status,
unsigned int error_mask,
ON__UINT32 error_code_point,
const ON__UINT32** sNextUTF16
)
{
int i, j, output_count, bSwapBytes;
ON__UINT32 u;
struct ON_UnicodeErrorParameters e;
if ( 0 != error_status )
*error_status = 0;
if ( -1 == sUTF16_count && 0 != sUTF16 )
{
for ( sUTF16_count = 0; 0 != sUTF16[sUTF16_count]; sUTF16_count++)
{
// empty for body
}
}
if ( 0 == sUTF16 || sUTF16_count < 0 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF16 )
*sNextUTF16 = sUTF16;
return 0;
}
if ( 0 == sUTF32_count )
{
sUTF32 = 0;
sUTF32_count = 2147483647; // maximum value of a 32-bit signed int
}
else if ( 0 == sUTF32 )
{
if ( 0 != error_status )
*error_status |= 1;
if ( sNextUTF16 )
*sNextUTF16 = sUTF16;
return 0;
}
bSwapBytes = false;
if ( bTestByteOrder && sUTF16_count > 0 )
{
if ( 0x0000FEFF == sUTF16[0] )
{
// skip BOM
sUTF16_count--;
sUTF16++;
}
else if ( 0xFFFE0000 == sUTF16[0])
{
// skip BOM and swap bytes in rest of sUTF16
bSwapBytes = true;
sUTF16_count--;
sUTF16++;
}
}
e.m_error_status = 0;
e.m_error_mask = error_mask;
e.m_error_code_point = error_code_point;
output_count = 0;
if ( bSwapBytes )
{
for ( i = 0; i < sUTF16_count; i += j )
{
j = ON_DecodeSwapByteUTF32(sUTF16+i,sUTF16_count-i,&e,&u);
if ( j <= 0 )
break;
if ( 0 != sUTF32 )
{
if ( output_count >= sUTF32_count )
{
e.m_error_status |= 2;
break;
}
sUTF32[output_count] = u;
}
output_count++;
}
}
else
{
for ( i = 0; i < sUTF16_count; i += j )
{
j = ON_DecodeUTF32(sUTF16+i,sUTF16_count-i,&e,&u);
if ( j <= 0 )
break;
if ( 0 != sUTF32 )
{
if ( output_count >= sUTF32_count )
{
e.m_error_status |= 2;
break;
}
sUTF32[output_count] = u;
}
output_count++;
}
}
if ( 0 != sUTF32 && output_count < sUTF32_count)
sUTF32[output_count] = 0;
if ( sNextUTF16 )
*sNextUTF16 = sUTF16+i;
if ( error_status )
*error_status = e.m_error_status;
return output_count;
}
// Converts a wchar_t string to UTF-8.
//
// The encoding of the wchar_t string is inferred from sizeof(wchar_t):
// 1 byte -> UTF-8, 2 bytes -> UTF-16, 4 bytes -> UTF-32, and the call is
// forwarded to the matching ON_ConvertUTF*ToUTF8() function. All
// parameters, the return value and the error_status bits have the meaning
// documented for that function; sWideChar_count and sNextWideChar are in
// wchar_t elements.
//
// For any other wchar_t size nothing is converted: 0 is returned,
// *error_status is set to 1 and *sNextWideChar is set to sWideChar, which
// matches the behavior of the ON_Convert*ToWideChar() functions.
int ON_ConvertWideCharToUTF8(
    int bTestByteOrder,
    const wchar_t* sWideChar,
    int sWideChar_count,
    char* sUTF8,
    int sUTF8_count,
    unsigned int* error_status,
    unsigned int error_mask,
    ON__UINT32 error_code_point,
    const wchar_t** sNextWideChar
    )
{
  int rc;

  switch(sizeof(sWideChar[0]))
  {
  case sizeof(char):
    // assume wchar_t strings are UTF-8 encoded
    rc = ON_ConvertUTF8ToUTF8(
            bTestByteOrder,
            (const char*)sWideChar,sWideChar_count,
            sUTF8,sUTF8_count,
            error_status,error_mask,error_code_point,
            (const char**)sNextWideChar
            );
    break;

  case sizeof(ON__UINT16):
    // assume wchar_t strings are UTF-16 encoded
    rc = ON_ConvertUTF16ToUTF8(
            bTestByteOrder,
            (const ON__UINT16*)sWideChar,sWideChar_count,
            sUTF8,sUTF8_count,
            error_status,error_mask,error_code_point,
            (const ON__UINT16**)sNextWideChar
            );
    break;

  case sizeof(ON__UINT32):
    // assume wchar_t strings are UTF-32 encoded
    rc = ON_ConvertUTF32ToUTF8(
            bTestByteOrder,
            (const ON__UINT32*)sWideChar,sWideChar_count,
            sUTF8,sUTF8_count,
            error_status,error_mask,error_code_point,
            (const ON__UINT32**)sNextWideChar
            );
    break;

  default:
    // unsupported wchar_t size - report the failure instead of leaving
    // the caller's output parameters untouched
    if (error_status)
      *error_status = 1;
    if (sNextWideChar)
      *sNextWideChar = sWideChar;
    rc = 0;
  }

  return rc;
}
// Converts a wchar_t string to UTF-16.
//
// The encoding of the wchar_t string is inferred from sizeof(wchar_t):
// 1 byte -> UTF-8, 2 bytes -> UTF-16, 4 bytes -> UTF-32, and the call is
// forwarded to the matching ON_ConvertUTF*ToUTF16() function. All
// parameters, the return value and the error_status bits have the meaning
// documented for that function; sWideChar_count and sNextWideChar are in
// wchar_t elements.
//
// NOTE: sUTF16 is declared as char* but is cast to ON__UINT16* and
// sUTF16_count is passed on unchanged as the capacity handed to the
// UTF-16 converters.
//
// For any other wchar_t size nothing is converted: 0 is returned,
// *error_status is set to 1 and *sNextWideChar is set to sWideChar, which
// matches the behavior of the ON_Convert*ToWideChar() functions.
int ON_ConvertWideCharToUTF16(
    int bTestByteOrder,
    const wchar_t* sWideChar,
    int sWideChar_count,
    char* sUTF16,
    int sUTF16_count,
    unsigned int* error_status,
    unsigned int error_mask,
    ON__UINT32 error_code_point,
    const wchar_t** sNextWideChar
    )
{
  int rc;

  switch(sizeof(sWideChar[0]))
  {
  case sizeof(char):
    // assume wchar_t strings are UTF-8 encoded
    rc = ON_ConvertUTF8ToUTF16(
            bTestByteOrder,
            (const char*)sWideChar,sWideChar_count,
            (ON__UINT16*)sUTF16,sUTF16_count,
            error_status,error_mask,error_code_point,
            (const char**)sNextWideChar
            );
    break;

  case sizeof(ON__UINT16):
    // assume wchar_t strings are UTF-16 encoded
    rc = ON_ConvertUTF16ToUTF16(
            bTestByteOrder,
            (const ON__UINT16*)sWideChar,sWideChar_count,
            (ON__UINT16*)sUTF16,sUTF16_count,
            error_status,error_mask,error_code_point,
            (const ON__UINT16**)sNextWideChar
            );
    break;

  case sizeof(ON__UINT32):
    // assume wchar_t strings are UTF-32 encoded
    rc = ON_ConvertUTF32ToUTF16(
            bTestByteOrder,
            (const ON__UINT32*)sWideChar,sWideChar_count,
            (ON__UINT16*)sUTF16,sUTF16_count,
            error_status,error_mask,error_code_point,
            (const ON__UINT32**)sNextWideChar
            );
    break;

  default:
    // unsupported wchar_t size - report the failure instead of leaving
    // the caller's output parameters untouched
    if (error_status)
      *error_status = 1;
    if (sNextWideChar)
      *sNextWideChar = sWideChar;
    rc = 0;
  }

  return rc;
}
// Converts a wchar_t string to UTF-32.
//
// The encoding of the wchar_t string is inferred from sizeof(wchar_t):
// 1 byte -> UTF-8, 2 bytes -> UTF-16, 4 bytes -> UTF-32, and the call is
// forwarded to the matching ON_ConvertUTF*ToUTF32() function. All
// parameters, the return value and the error_status bits have the meaning
// documented for that function; sWideChar_count and sNextWideChar are in
// wchar_t elements.
//
// For any other wchar_t size nothing is converted: 0 is returned,
// *error_status is set to 1 and *sNextWideChar is set to sWideChar, which
// matches the behavior of the ON_Convert*ToWideChar() functions.
int ON_ConvertWideCharToUTF32(
    int bTestByteOrder,
    const wchar_t* sWideChar,
    int sWideChar_count,
    ON__UINT32* sUTF32,
    int sUTF32_count,
    unsigned int* error_status,
    unsigned int error_mask,
    ON__UINT32 error_code_point,
    const wchar_t** sNextWideChar
    )
{
  int rc;

  switch(sizeof(sWideChar[0]))
  {
  case sizeof(char):
    // assume wchar_t strings are UTF-8 encoded
    rc = ON_ConvertUTF8ToUTF32(
            bTestByteOrder,
            (const char*)sWideChar,sWideChar_count,
            sUTF32,sUTF32_count,
            error_status,error_mask,error_code_point,
            (const char**)sNextWideChar
            );
    break;

  case sizeof(ON__UINT16):
    // assume wchar_t strings are UTF-16 encoded
    rc = ON_ConvertUTF16ToUTF32(
            bTestByteOrder,
            (const ON__UINT16*)sWideChar,sWideChar_count,
            sUTF32,sUTF32_count,
            error_status,error_mask,error_code_point,
            (const ON__UINT16**)sNextWideChar
            );
    break;

  case sizeof(ON__UINT32):
    // assume wchar_t strings are UTF-32 encoded
    rc = ON_ConvertUTF32ToUTF32(
            bTestByteOrder,
            (const ON__UINT32*)sWideChar,sWideChar_count,
            sUTF32,sUTF32_count,
            error_status,error_mask,error_code_point,
            (const ON__UINT32**)sNextWideChar
            );
    break;

  default:
    // unsupported wchar_t size - report the failure instead of leaving
    // the caller's output parameters untouched
    if (error_status)
      *error_status = 1;
    if (sNextWideChar)
      *sNextWideChar = sWideChar;
    rc = 0;
  }

  return rc;
}
// Converts a UTF-8 string to a wchar_t string.
//
// The encoding written to sWideChar is chosen from sizeof(wchar_t):
// 1 byte -> UTF-8, 2 bytes -> UTF-16, 4 bytes -> UTF-32, and the call is
// forwarded to the matching ON_ConvertUTF8ToUTF*() function, whose
// parameter, return value and error_status conventions apply.
// sWideChar_count is the output capacity in wchar_t elements.
//
// For any other wchar_t size nothing is converted: 0 is returned,
// *error_status is set to 1 and *sNextUTF8 is set to sUTF8.
int ON_ConvertUTF8ToWideChar(
    int bTestByteOrder,
    const char* sUTF8,
    int sUTF8_count,
    wchar_t* sWideChar,
    int sWideChar_count,
    unsigned int* error_status,
    unsigned int error_mask,
    ON__UINT32 error_code_point,
    const char** sNextUTF8
    )
{
  switch(sizeof(sWideChar[0]))
  {
  case sizeof(char):
    // wchar_t strings are treated as UTF-8
    return ON_ConvertUTF8ToUTF8(
      bTestByteOrder,
      sUTF8, sUTF8_count,
      (char*)sWideChar, sWideChar_count,
      error_status, error_mask, error_code_point,
      sNextUTF8
      );

  case sizeof(ON__UINT16):
    // wchar_t strings are treated as UTF-16
    return ON_ConvertUTF8ToUTF16(
      bTestByteOrder,
      sUTF8, sUTF8_count,
      (ON__UINT16*)sWideChar, sWideChar_count,
      error_status, error_mask, error_code_point,
      sNextUTF8
      );

  case sizeof(ON__UINT32):
    // wchar_t strings are treated as UTF-32
    return ON_ConvertUTF8ToUTF32(
      bTestByteOrder,
      sUTF8, sUTF8_count,
      (ON__UINT32*)sWideChar, sWideChar_count,
      error_status, error_mask, error_code_point,
      sNextUTF8
      );

  default:
    break;
  }

  // unsupported wchar_t size
  if (error_status)
    *error_status = 1;
  if (sNextUTF8)
    *sNextUTF8 = sUTF8;
  return 0;
}
int ON_ConvertUTF16ToWideChar(
int bTestByteOrder,
const ON__UINT16* sUTF16,
int sUTF16_count,
wchar_t* sWideChar,
int sWideChar_count,
unsigned int* error_status,
unsigned int error_mask,
ON__UINT32 error_code_point,
const ON__UINT16** sNextUTF16
)
{
int rc;
switch(sizeof(sWideChar[0]))
{
case sizeof(char):
// assume wchar_t strings are UTF-8 encoded
rc = ON_ConvertUTF16ToUTF8(
bTestByteOrder,
(const ON__UINT16*)sUTF16,sUTF16_count,
(char*)sWideChar,sWideChar_count,
error_status,error_mask,error_code_point,
(const ON__UINT16**)sNextUTF16
);
break;
case sizeof(ON__UINT16):
// assume wchar_t strings are UTF-16 encoded
rc = ON_ConvertUTF16ToUTF16(
bTestByteOrder,
(const ON__UINT16*)sUTF16,sUTF16_count,
(ON__UINT16*)sWideChar,sWideChar_count,
error_status,error_mask,error_code_point,
(const ON__UINT16**)sNextUTF16
);
break;
case sizeof(ON__UINT32):
// assume wchar_t strings are UTF-32 encoded
rc = ON_ConvertUTF16ToUTF32(
bTestByteOrder,
(const ON__UINT16*)sUTF16,sUTF16_count,
(ON__UINT32*)sWideChar,sWideChar_count,
error_status,error_mask,error_code_point,
(const ON__UINT16**)sNextUTF16
);
break;
default:
if (error_status)
*error_status = 1;
if (sNextUTF16)
*sNextUTF16 = sUTF16;
rc = 0;
}
return rc;
}
int ON_ConvertUTF32ToWideChar(
int bTestByteOrder,
const ON__UINT32* sUTF32,
int sUTF32_count,
wchar_t* sWideChar,
int sWideChar_count,
unsigned int* error_status,
unsigned int error_mask,
ON__UINT32 error_code_point,
const ON__UINT32** sNextUTF32
)
{
int rc;
switch(sizeof(sWideChar[0]))
{
case sizeof(char):
// assume wchar_t strings are UTF-8 encoded
rc = ON_ConvertUTF32ToUTF8(
bTestByteOrder,
(const ON__UINT32*)sUTF32,sUTF32_count,
(char*)sWideChar,sWideChar_count,
error_status,error_mask,error_code_point,
(const ON__UINT32**)sNextUTF32
);
break;
case sizeof(ON__UINT16):
// assume wchar_t strings are UTF-16 encoded
rc = ON_ConvertUTF32ToUTF16(
bTestByteOrder,
(const ON__UINT32*)sUTF32,sUTF32_count,
(ON__UINT16*)sWideChar,sWideChar_count,
error_status,error_mask,error_code_point,
(const ON__UINT32**)sNextUTF32
);
break;
case sizeof(ON__UINT32):
// assume wchar_t strings are UTF-32 encoded
rc = ON_ConvertUTF32ToUTF32(
bTestByteOrder,
(const ON__UINT32*)sUTF32,sUTF32_count,
(ON__UINT32*)sWideChar,sWideChar_count,
error_status,error_mask,error_code_point,
(const ON__UINT32**)sNextUTF32
);
break;
default:
if (error_status)
*error_status = 1;
if (sNextUTF32)
*sNextUTF32 = sUTF32;
rc = 0;
}
return rc;
}
// Builds a wide character string from an array of Unicode code points.
//
// Parameters:
//   code_points      - input code points (UTF-32 values).
//   code_point_count - number of elements in code_points, or -1 when the
//                      array is terminated by a 0 element.
//   error_code_point - when this is a valid Unicode code point, all
//                      conversion errors are masked and this value is
//                      passed to the converter as the replacement. When it
//                      is not valid, no errors are masked and, in the
//                      null terminated case, the input is cut at the first
//                      invalid code point.
// Returns:
//   The converted string, or ON_wString::EmptyString when code_points is
//   null, the count is not positive, or the conversion produces nothing.
const ON_wString ON_wString::FromUnicodeCodePoints(
  const ON__UINT32* code_points,
  int code_point_count,
  ON__UINT32 error_code_point
)
{
  if (nullptr == code_points)
    return ON_wString::EmptyString;

  const bool bCanSubstitute = ON_IsValidUnicodeCodePoint(error_code_point);

  if (-1 == code_point_count)
  {
    // Measure the null terminated input. Without a usable replacement
    // value the scan also stops at the first invalid code point.
    int n = 0;
    for (;;)
    {
      const ON__UINT32 cp = code_points[n];
      if (0 == cp)
        break;
      if (!bCanSubstitute && !ON_IsValidUnicodeCodePoint(cp))
        break;
      n++;
    }
    code_point_count = n;
  }

  if (code_point_count <= 0)
    return ON_wString::EmptyString;

  const int bTestByteOrder = false;
  const unsigned int error_mask = bCanSubstitute ? 0xFFFFFFFF : 0;
  unsigned int error_status = 0;

  // Pass 1: a null buffer with zero capacity yields the number of wchar_t
  // elements the result needs.
  const int required_count = ON_ConvertUTF32ToWideChar(
    bTestByteOrder,
    code_points, code_point_count,
    nullptr, 0,
    &error_status, error_mask, error_code_point,
    nullptr
  );
  if (required_count <= 0)
    return ON_wString::EmptyString;

  // Pass 2: convert into the string's own storage, with one extra element
  // so the converter has room for its terminator.
  ON_wString result;
  const int buffer_capacity = required_count + 1;
  wchar_t* buffer = result.ReserveArray((size_t)buffer_capacity);
  error_status = 0;
  const int converted_count = ON_ConvertUTF32ToWideChar(
    bTestByteOrder,
    code_points, code_point_count,
    buffer, buffer_capacity,
    &error_status, error_mask, error_code_point,
    nullptr
  );
  if (converted_count <= 0)
    return ON_wString::EmptyString;

  result.SetLength(converted_count);
  return result;
}
////int ON_ConvertWindowsCodePageValueToWideChar(
//// int windows_code_page,
//// ON__UINT32 code_page_character_value,
//// size_t w_capacity,
//// wchar_t* w
////)
////{
//// ON__UINT32 unicode_code_point = ON_UnicodeCodePoint::ON_ReplacementCharacter;
//// ON_UnicodeErrorParameters e;
//// memset(&e, 0, sizeof(e));
//// e.m_error_mask = 0xFF;
//// e.m_error_code_point = ON_UnicodeCodePoint::ON_ReplacementCharacter;
//// ON_DecodeWindowsCodePageValue( windows_code_page, code_page_character_value, &e, &unicode_code_point);
//// return ON_EncodeWideChar(unicode_code_point, w_capacity, w);
////}
// Maps an RTF \fcharsetN value to the corresponding Windows or Mac code page.
//
// Parameters:
//   rtf_charset       - the N from an RTF \fcharsetN control word.
//   default_code_page - returned for charsets that have no code page in
//                       the table below (the "old" charsets 82, 179, 180,
//                       181) and for any unrecognized value.
// Returns:
//   The code page number.
ON__UINT32 ON_MapRTFcharsetToWindowsCodePage(
  ON__UINT32 rtf_charset,
  ON__UINT32 default_code_page
)
{
  // From the Microsoft version of the RTF ver 1.9 spec available on MSDN
  //
  // \fcharsetN: Specifies the character set of a font in the font table.
  // If this appears, it implies that bytes in runs tagged with the
  // associated \fN are character codes in the codepage corresponding to the
  // charset N. Use this codepage to convert the codes to Unicode using a
  // function like the Windows MultiByteToWideChar(). See also the \cpgN
  // control word, which, if it appears, supersedes the codepage given by
  // \fcharsetN. Values for N are defined, for example, in the Windows header
  // file wingdi.h (e.g., see ANSI_CHARSET) and are repeated here together
  // with the corresponding Windows or Mac codepages for convenience.
  //
  // A font may have a different character set from the character set of the
  // document. For example, the Symbol font has the same characters in the
  // same code positions both on the Macintosh and in Windows. Typically, RTF
  // fonts use the code page corresponding to the \fcharsetN control word in
  // their \fonttbl description. If the charset doesn't exist, the codepage
  // may be given by the \cpgN control word, for which the code page is N.
  // If the \cpgN does appear, it supersedes the code page corresponding to
  // the \fcharsetN.
  //
  // For such cases, codepage conversions can be avoided altogether by using
  // the Unicode \uN notation for characters. In addition, file names (used
  // in field instructions and in embedded fonts) may not necessarily be the
  // same as the character set of the document; the \cpgN control word can
  // change the character set for these file names as well.
  switch (rtf_charset)
  {
  case 0:   return 1252;   // ANSI
  case 1:   return 0;      // Default
  case 2:   return 42;     // Symbol

  case 77:  return 10000;  // Mac Roman
  case 78:  return 10001;  // Mac Shift Jis
  case 79:  return 10003;  // Mac Hangul
  case 80:  return 10008;  // Mac GB2312
  case 81:  return 10002;  // Mac Big5
  case 83:  return 10005;  // Mac Hebrew
  case 84:  return 10004;  // Mac Arabic
  case 85:  return 10006;  // Mac Greek
  case 86:  return 10081;  // Mac Turkish
  case 87:  return 10021;  // Mac Thai
  case 88:  return 10029;  // Mac East Europe
  case 89:  return 10007;  // Mac Russian

  case 128: return 932;    // Shift JIS
  case 129: return 949;    // Hangul (Korean)
  case 130: return 1361;   // Johab
  case 134: return 936;    // GB2312
  case 136: return 950;    // Big5
  case 161: return 1253;   // Greek
  case 162: return 1254;   // Turkish
  case 163: return 1258;   // Vietnamese
  case 177: return 1255;   // Hebrew
  case 178: return 1256;   // Arabic
  case 186: return 1257;   // Baltic
  case 204: return 1251;   // Russian
  case 222: return 874;    // Thai
  case 238: return 1250;   // Eastern European
  case 254: return 437;    // PC 437
  case 255: return 850;    // OEM

  case 82:                 // Mac Johab (old)
  case 179:                // Arabic Traditional (old)
  case 180:                // Arabic user (old)
  case 181:                // Hebrew user (old)
  default:
    break;
  }

  return default_code_page;
}
// Converts a string encoded with a single byte code page to a wchar_t
// string using a 128 entry lookup table.
//
// Parameters:
//   sb_code_page_0x80_to_0xFF_to_unicode
//                      - table indexed by (byte - 0x80) giving the Unicode
//                        code point for bytes 0x80..0xFF. When null, every
//                        byte >= 0x80 becomes the replacement character.
//   sMBCS              - input bytes. Bytes below 0x80 map to the code
//                        point with the same value.
//   sMBCS_count        - number of input bytes; a null sMBCS or a negative
//                        count is treated as empty input.
//   sWideChar          - output buffer, or null to only count.
//   sWideChar_capacity - output capacity in wchar_t elements; <= 0 means
//                        count only.
//   error_status       - optional; receives the error bits
//                        (2 = output buffer too small, 16 = a byte had no
//                        valid mapping or could not be encoded).
// Returns:
//   The number of wchar_t elements needed for the whole input. This keeps
//   growing even after the output buffer is full, so it may exceed
//   sWideChar_capacity. Unused output elements are set to 0.
static int ON_Internal_ConvertMSSBCPToWideChar(
  const ON__UINT32* sb_code_page_0x80_to_0xFF_to_unicode,
  const char* sMBCS,
  int sMBCS_count,
  wchar_t* sWideChar,
  int sWideChar_capacity,
  unsigned int* error_status
)
{
  wchar_t* sWideCharMax
    = (sWideChar_capacity > 0 && nullptr != sWideChar)
    ? sWideChar + sWideChar_capacity
    : nullptr;
  if (nullptr == sWideCharMax)
  {
    // counting mode
    sWideChar = nullptr;
    sWideChar_capacity = 0;
  }
  else
  {
    sWideChar[0] = 0;
  }

  if (nullptr != error_status)
    *error_status = 0;
  unsigned int e = 0;

  if (nullptr == sMBCS || sMBCS_count < 0)
    sMBCS_count = 0;

  wchar_t* s = sWideChar;
  wchar_t w_buffer[8];
  int rc = 0;
  for (int i = 0; i < sMBCS_count; i++)
  {
    // c is in 0..0xFF because it comes from an unsigned char
    const ON__UINT32 c = (unsigned char)sMBCS[i];
    ON__UINT32 unicode_code_point;
    if (c < 0x80)
      unicode_code_point = c;
    else
    {
      if (nullptr != sb_code_page_0x80_to_0xFF_to_unicode)
      {
        unicode_code_point = sb_code_page_0x80_to_0xFF_to_unicode[c - 0x80];
        if (0 == ON_IsValidUnicodeCodePoint(unicode_code_point))
          unicode_code_point = ON_UnicodeCodePoint::ON_ReplacementCharacter;
      }
      else
        unicode_code_point = ON_UnicodeCodePoint::ON_ReplacementCharacter;
      if (ON_UnicodeCodePoint::ON_ReplacementCharacter == unicode_code_point)
        e |= 16;
    }

    const int w_count = ON_EncodeWideChar(unicode_code_point, sizeof(w_buffer)/sizeof(w_buffer[0]), w_buffer);
    if (w_count <= 0)
    {
      e |= 16;
      continue;
    }

    rc += w_count;
    if (nullptr == s)
      continue; // counting mode

    // Compare against the remaining room instead of forming s + w_count,
    // which could point beyond one-past-the-end of the output buffer.
    if (w_count > (sWideCharMax - s))
    {
      e |= 2;
      continue;
    }

    for (int k = 0; k < w_count; k++)
      *s++ = w_buffer[k];
  }

  // zero the unused tail (also terminates the string when there is room)
  while (s < sWideCharMax)
  {
    *s++ = 0;
  }

  if (nullptr != error_status)
    *error_status = e;

  return rc;
}
// Converts a string encoded with a Microsoft code page to a wchar_t string.
//
// Parameters:
//   windows_code_page  - code page number of the input.
//   sMBCS              - input bytes.
//   sMBCS_count        - number of input bytes, or -1 when sMBCS is null
//                        terminated.
//   sWideChar          - output buffer, or null to only count.
//   sWideChar_capacity - output capacity in wchar_t elements; <= 0 means
//                        count only.
//   error_status       - optional; receives the error bits
//                        (1 = invalid input or a code page that cannot be
//                        converted, 2 = output buffer too small,
//                        16 = a value had no valid mapping).
// Returns:
//   Number of wchar_t elements produced (or required).
//   0 is returned for empty input and whenever the conversion fails.
//
// Conversion strategy:
//   1. Input made only of bytes 0..127 is copied directly.
//   2. If ON_MSSBCP_0x80_0xFF_Unicode() has a table for the code page, the
//      built in single byte conversion is used.
//   3. On Windows, MultiByteToWideChar() is used.
//   4. Otherwise the conversion is not supported and an error is reported.
int ON_ConvertMSMBCPToWideChar(
    ON__UINT32 windows_code_page,
    const char* sMBCS,
    int sMBCS_count,
    wchar_t* sWideChar,
    int sWideChar_capacity,
    unsigned int* error_status
    )
{
  if ( 0 != error_status )
    *error_status = 0;

  if ( -1 == sMBCS_count && nullptr != sMBCS )
  {
    for ( sMBCS_count = 0; 0 != sMBCS[sMBCS_count]; sMBCS_count++)
    {
      // empty for body
    }
  }

  if ( nullptr == sMBCS || sMBCS_count < 0 )
  {
    if ( 0 != error_status )
      *error_status |= 1;
    return 0;
  }

  if ( 0 == sMBCS_count )
  {
    return 0;
  }

  if (sWideChar_capacity <= 0)
  {
    sWideChar_capacity = 0;
    sWideChar = nullptr;
  }
  else if (nullptr == sWideChar)
  {
    sWideChar_capacity = 0;
  }
  else
  {
    sWideChar[0] = 0;
  }

  // Fast path: copy the leading run of 7-bit values directly.
  const char* c = sMBCS;
  const char* c1 = c + sMBCS_count;
  wchar_t* w = sWideChar;
  wchar_t* w1 = w + sWideChar_capacity;
  while (c < c1 && *c >= 0 && *c <= 127)
  {
    if (nullptr != w)
    {
      if (w >= w1)
        break; // output buffer is full
      *w++ = (wchar_t)*c;
    }
    c++;
  }
  if (c == c1)
  {
    // the entire input was 7-bit
    if (w < w1)
      *w = 0;
    return sMBCS_count;
  }

  const ON__UINT32* sb_code_page_0x80_to_0xFF_to_unicode = ON_MSSBCP_0x80_0xFF_Unicode(windows_code_page);
  if (nullptr != sb_code_page_0x80_to_0xFF_to_unicode)
  {
    // fast platform independent single byte code page conversion built into opennurbs
    return ON_Internal_ConvertMSSBCPToWideChar(
      sb_code_page_0x80_to_0xFF_to_unicode,
      sMBCS,
      sMBCS_count,
      sWideChar,
      sWideChar_capacity,
      error_status
    );
  }

#if defined(ON_RUNTIME_WIN)
  // Starting with Windows Vista, the function does not drop illegal code points when dwFlags=0.
  // It replaces illegal sequences with U+FFFD (encoded as appropriate for the specified codepage).
  DWORD dwFlags = 0;
  int sWideChar_count = ::MultiByteToWideChar(windows_code_page, dwFlags, sMBCS, sMBCS_count, sWideChar, sWideChar_capacity);
  if (sWideChar_count <= 0)
  {
    // sMBCS_count > 0 here, so a result of 0 means the call failed.
    // Report why instead of silently returning an empty result.
    const DWORD last_error = ::GetLastError();
    if (nullptr != error_status)
      *error_status |= (ERROR_INSUFFICIENT_BUFFER == last_error) ? 2u : 1u;
    sWideChar_count = 0;
  }
  if (nullptr == sWideChar)
    return sWideChar_count;

  for (int i = 0; i < sWideChar_count; i++)
  {
    if (0 == sWideChar[i])
    {
      // the result ends at the first null element
      sWideChar_count = i;
      break;
    }
    if ( ON_wString::ReplacementCharacter == sWideChar[i] )
    {
      if ( nullptr != error_status)
        *error_status |= 16;
    }
  }
  if (sWideChar_count < sWideChar_capacity)
    sWideChar[sWideChar_count] = 0;
  return sWideChar_count;
#else
  // Add support for Mac if needed.
  // Shift JIS, Hangol (code page 949), and Big 5 are likely candidates.
  // These are encodings with either 1 or 2 bytes per glyph.
  //
  // Until then nothing was converted. Tell the caller why: either the
  // output buffer filled up while copying the 7-bit prefix, or this code
  // page cannot be converted on this platform.
  if (nullptr != error_status)
    *error_status |= (nullptr != w && w >= w1) ? 2u : 1u;
  return 0;
#endif
}