c434cad322
Files correlati : Ricompilazione Demo : [ ] Commento : Riportata la versione 3.1 patch 650 git-svn-id: svn://10.65.10.50/trunk@14148 c028cbd2-c16b-5b4b-a496-9718f37d4682
1689 lines
51 KiB
C
Executable File
1689 lines
51 KiB
C
Executable File
/*---------------------------------------------------------------------------*
|
|
| PDFlib - A library for generating PDF on the fly |
|
|
+---------------------------------------------------------------------------+
|
|
| Copyright (c) 1997-2005 Thomas Merz and PDFlib GmbH. All rights reserved. |
|
|
+---------------------------------------------------------------------------+
|
|
| |
|
|
| This software is subject to the PDFlib license. It is NOT in the |
|
|
| public domain. Extended versions and commercial licenses are |
|
|
| available, please check http://www.pdflib.com. |
|
|
| |
|
|
*---------------------------------------------------------------------------*/
|
|
|
|
/* $Id: pc_unicode.c,v 1.2 2006-07-11 13:10:33 alex Exp $
|
|
*
|
|
* PDFlib routines for converting between Unicode values and Adobe glyph names
|
|
*
|
|
*/
|
|
|
|
#include "pc_util.h"
|
|
#include "pc_chartabs.h"
|
|
|
|
#if defined(WIN32)
|
|
#define WIN32_LEAN_AND_MEAN
|
|
#include <windows.h>
|
|
#endif /* WIN32 */
|
|
|
|
|
|
/*
|
|
* Returns the Unicode value of a Adobe glyph name.
|
|
* If the name is not contained in the Adobe Glyph List
|
|
* (AGL 1.2) -1 will be returned.
|
|
*/
|
|
|
|
int
|
|
pdc_adobe2unicode(const char *name)
|
|
{
|
|
int lo = 0;
|
|
int hi = ((sizeof tab_agl2uni) / (sizeof (pdc_glyph_tab)));
|
|
|
|
if (name)
|
|
{
|
|
while (lo < hi)
|
|
{
|
|
int i = (lo + hi) / 2;
|
|
int cmp = strcmp(name, tab_agl2uni[i].glyphname);
|
|
|
|
if (cmp == 0)
|
|
return (int) tab_agl2uni[i].code;
|
|
|
|
if (cmp < 0)
|
|
hi = i;
|
|
else
|
|
lo = i + 1;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/*
|
|
* Returns the name in the Adobe Glyph List which corresponds to
|
|
* the supplied Unicode value. If the value doesn't have a
|
|
* corresponding Unicode name NULL will be returned.
|
|
*/
|
|
|
|
const char *
|
|
pdc_unicode2adobe(pdc_ushort uv)
|
|
{
|
|
int lo = 0;
|
|
int hi = ((sizeof tab_uni2agl) / (sizeof (pdc_glyph_tab)));
|
|
|
|
if (uv)
|
|
{
|
|
while (lo < hi)
|
|
{
|
|
int i = (lo + hi) / 2;
|
|
|
|
if (uv == tab_uni2agl[i].code)
|
|
return tab_uni2agl[i].glyphname;
|
|
|
|
if (uv < tab_uni2agl[i].code)
|
|
hi = i;
|
|
else
|
|
lo = i + 1;
|
|
}
|
|
}
|
|
|
|
/* C0 and C1 control characters.
|
|
* They have never a graphical representation.
|
|
*/
|
|
if (uv < 0x0020 || (uv >= 0x007F && uv <= 0x009F))
|
|
return (char *) glyph__notdef;
|
|
|
|
lo = 0;
|
|
hi = ((sizeof tab_uni2zadb) / (sizeof (pdc_glyph_tab)));
|
|
|
|
if (uv)
|
|
{
|
|
while (lo < hi)
|
|
{
|
|
int i = (lo + hi) / 2;
|
|
|
|
if (uv == tab_uni2agl[i].code)
|
|
return tab_uni2zadb[i].glyphname;
|
|
|
|
if (uv < tab_uni2zadb[i].code)
|
|
hi = i;
|
|
else
|
|
lo = i + 1;
|
|
}
|
|
}
|
|
|
|
return (char *) 0;
|
|
}
|
|
|
|
const char *
|
|
pdc_get_notdef_glyphname(void)
|
|
{
|
|
return (char *) glyph__notdef;
|
|
}
|
|
|
|
/*
|
|
* Returns the Unicode value of a ZapfDingbats glyph name.
|
|
* If the name is not contained in the ZapfDingbats list
|
|
* -1 will be returned.
|
|
*/
|
|
|
|
int
|
|
pdc_zadb2unicode(const char *name)
|
|
{
|
|
int lo = 0;
|
|
int hi = ((sizeof tab_zadb2uni) / (sizeof (pdc_glyph_tab)));
|
|
|
|
if (name)
|
|
{
|
|
while (lo < hi)
|
|
{
|
|
int i = (lo + hi) / 2;
|
|
int cmp = strcmp(name, tab_zadb2uni[i].glyphname);
|
|
|
|
if (cmp == 0)
|
|
return (int) tab_zadb2uni[i].code;
|
|
|
|
if (cmp < 0)
|
|
hi = i;
|
|
else
|
|
lo = i + 1;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/*
|
|
* Returns the name in the ZapfDingbats font which corresponds to
|
|
* the supplied Unicode value. If the value doesn't have a
|
|
* corresponding Unicode name NULL will be returned.
|
|
*/
|
|
|
|
const char *
|
|
pdc_unicode2zadb(pdc_ushort uv)
|
|
{
|
|
int lo = 0;
|
|
int hi = ((sizeof tab_uni2zadb) / (sizeof (pdc_glyph_tab)));
|
|
|
|
if (uv)
|
|
{
|
|
while (lo < hi)
|
|
{
|
|
int i = (lo + hi) / 2;
|
|
|
|
if (uv == tab_uni2zadb[i].code)
|
|
return tab_uni2zadb[i].glyphname;
|
|
|
|
if (uv < tab_uni2zadb[i].code)
|
|
hi = i;
|
|
else
|
|
lo = i + 1;
|
|
}
|
|
}
|
|
|
|
return (char *) 0;
|
|
}
|
|
|
|
|
|
pdc_ushort
|
|
pdc_string2unicode(pdc_core *pdc, const char *text, int i_flags,
|
|
const pdc_keyconn *keyconn, pdc_bool verbose)
|
|
{
|
|
pdc_ushort uv = PDC_UNICODE_NOTCHAR;
|
|
int flags = PDC_INT_UNSIGNED | PDC_INT_SHORT;
|
|
int i = 0;
|
|
|
|
(void) pdc;
|
|
(void) verbose;
|
|
|
|
if (!strncmp(text, "U+", 2))
|
|
{
|
|
flags |= PDC_INT_HEXADEC;
|
|
i = 2;
|
|
}
|
|
|
|
if (i_flags & PDC_INT_CASESENS)
|
|
flags |= PDC_INT_CASESENS;
|
|
|
|
if (!pdc_get_integerkeycode(&text[i], keyconn, flags, &uv) && !i)
|
|
{
|
|
uv = PDC_UNICODE_NOTCHAR;
|
|
}
|
|
|
|
return uv;
|
|
}
|
|
|
|
|
|
/*
|
|
* Returns true if a character name is contained in pc_standard_latin_charset.
|
|
* Otherwise false will be returned.
|
|
*/
|
|
|
|
pdc_bool
|
|
pdc_is_std_charname(const char *name)
|
|
{
|
|
int lo = 0;
|
|
int hi = ((sizeof pc_standard_latin_charset) / (sizeof (char *)));
|
|
|
|
if (name)
|
|
{
|
|
while (lo < hi)
|
|
{
|
|
int i = (lo + hi) / 2;
|
|
int cmp = strcmp(name, pc_standard_latin_charset[i]);
|
|
|
|
if (cmp == 0)
|
|
return pdc_true;
|
|
|
|
if (cmp < 0)
|
|
hi = i;
|
|
else
|
|
lo = i + 1;
|
|
}
|
|
}
|
|
|
|
return pdc_false;
|
|
}
|
|
|
|
/*
|
|
* Replacement of a Unicode character
|
|
* by a typographically equivalent Unicode character
|
|
* or at least by SPACE
|
|
*
|
|
*/
|
|
|
|
pdc_ushort
|
|
pdc_get_equi_unicode(pdc_ushort uv)
|
|
{
|
|
int i = 0;
|
|
|
|
while (pc_equi_unicodes[i])
|
|
{
|
|
if (pc_equi_unicodes[i] == uv)
|
|
return pc_equi_unicodes[i+1];
|
|
i += 2;
|
|
}
|
|
|
|
/* Fullwidth ASCII variants */
|
|
if (uv > 0xFF00 && uv < 0xFF5E)
|
|
{
|
|
uv -= 0xFF00 - 0x0020;
|
|
return uv;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*
|
|
* Deletes a bit in a bit mask. The bit indicates that
|
|
* the respective glyph name of AGL 2.0 is not available
|
|
* in a PostScript font. The glyph name is used to avoid
|
|
* ambiguities (see comment in pc_chartabs.h)
|
|
*
|
|
*/
|
|
|
|
#define PDC_BIT_NBSP (1L<<0)
|
|
#define PDC_BIT_SHY (1L<<1)
|
|
#define PDC_BIT_MODMACRON (1L<<2)
|
|
#define PDC_BIT_CAPDELTA (1L<<3)
|
|
#define PDC_BIT_CAPOMEGA (1L<<4)
|
|
#define PDC_BIT_DIVSLASH (1L<<5)
|
|
#define PDC_BIT_BULLETOP (1L<<6)
|
|
#define PDC_BIT_SMALLMU (1L<<7)
|
|
|
|
void
|
|
pdc_delete_missingglyph_bit(pdc_ushort uv, pdc_ulong *bmask)
|
|
{
|
|
switch(uv)
|
|
{
|
|
case PDC_UNICODE_NBSP:
|
|
*bmask &= ~PDC_BIT_NBSP;
|
|
return;
|
|
|
|
case PDC_UNICODE_SHY:
|
|
*bmask &= ~PDC_BIT_SHY;
|
|
return;
|
|
|
|
case PDC_UNICODE_MODMACRON:
|
|
*bmask &= ~PDC_BIT_MODMACRON;
|
|
return;
|
|
|
|
case PDC_UNICODE_CAPDELTA:
|
|
*bmask &= ~PDC_BIT_CAPDELTA;
|
|
return;
|
|
|
|
case PDC_UNICODE_CAPOMEGA:
|
|
*bmask &= ~PDC_BIT_CAPOMEGA;
|
|
return;
|
|
|
|
case PDC_UNICODE_DIVSLASH:
|
|
*bmask &= ~PDC_BIT_DIVSLASH;
|
|
return;
|
|
|
|
case PDC_UNICODE_BULLETOP:
|
|
*bmask &= ~PDC_BIT_BULLETOP;
|
|
return;
|
|
|
|
case PDC_UNICODE_SMALLMU:
|
|
*bmask &= ~PDC_BIT_SMALLMU;
|
|
return;
|
|
|
|
default:
|
|
return;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Returnes an alternative Unicode value and/or glyph name for an
|
|
* AGL 2.0 glyph name which is not available in a PostScript font.
|
|
*
|
|
*/
|
|
|
|
pdc_ushort
|
|
pdc_get_alter_glyphname(pdc_ushort uv, pdc_ulong bmask, char **glyphname)
|
|
{
|
|
switch(uv)
|
|
{
|
|
case PDC_UNICODE_NBSP:
|
|
if (bmask & PDC_BIT_NBSP)
|
|
{
|
|
if (glyphname)
|
|
*glyphname = (char *) glyph_space;
|
|
return PDC_UNICODE_SPACE;
|
|
}
|
|
break;
|
|
|
|
case PDC_UNICODE_SHY:
|
|
if (bmask & PDC_BIT_SHY)
|
|
{
|
|
if (glyphname)
|
|
*glyphname = (char *) glyph_hyphen;
|
|
return PDC_UNICODE_HYPHEN;
|
|
}
|
|
break;
|
|
|
|
case PDC_UNICODE_MODMACRON:
|
|
if (bmask & PDC_BIT_MODMACRON)
|
|
{
|
|
if (glyphname)
|
|
*glyphname = (char *) glyph_macron;
|
|
return PDC_UNICODE_MACRON;
|
|
}
|
|
break;
|
|
|
|
case PDC_UNICODE_CAPDELTA:
|
|
if (bmask & PDC_BIT_CAPDELTA)
|
|
{
|
|
if (glyphname)
|
|
*glyphname = (char *) glyph_Delta;
|
|
return PDC_UNICODE_INCREMENT;
|
|
}
|
|
break;
|
|
|
|
case PDC_UNICODE_CAPOMEGA:
|
|
if (bmask & PDC_BIT_CAPOMEGA)
|
|
{
|
|
if (glyphname)
|
|
*glyphname = (char *) glyph_Omega;
|
|
return PDC_UNICODE_OHMSIGN;
|
|
}
|
|
break;
|
|
|
|
case PDC_UNICODE_DIVSLASH:
|
|
if (bmask & PDC_BIT_DIVSLASH)
|
|
{
|
|
if (glyphname)
|
|
*glyphname = (char *) glyph_fraction;
|
|
return PDC_UNICODE_FRACSLASH;
|
|
}
|
|
|
|
case PDC_UNICODE_BULLETOP:
|
|
if (bmask & PDC_BIT_BULLETOP)
|
|
{
|
|
if (glyphname)
|
|
*glyphname = (char *) glyph_periodcentered;
|
|
return PDC_UNICODE_MIDDLEDOT;
|
|
}
|
|
|
|
case PDC_UNICODE_SMALLMU:
|
|
if (bmask & PDC_BIT_SMALLMU)
|
|
{
|
|
if (glyphname)
|
|
*glyphname = (char *) glyph_mu;
|
|
return PDC_UNICODE_MICRO;
|
|
}
|
|
|
|
default:
|
|
if (glyphname)
|
|
{
|
|
if (*glyphname == NULL)
|
|
*glyphname = (char *) pdc_get_notdef_glyphname();
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
return uv;
|
|
}
|
|
|
|
|
|
/*
|
|
* The following source is based on Unicode's original source
|
|
* code ConvertUTF.c. It has been adapted to PDFlib programming
|
|
* conventions.
|
|
*
|
|
* The original file had the following notice:
|
|
*
|
|
* Copyright 2001 Unicode, Inc.
|
|
*
|
|
* Limitations on Rights to Redistribute This Code
|
|
*
|
|
* Author: Mark E. Davis, 1994.
|
|
* Rev History: Rick McGowan, fixes & updates May 2001.
|
|
*
|
|
*
|
|
* Functions for conversions between UTF32, UTF-16, and UTF-8.
|
|
* These functions form a complete set of conversions between
|
|
* the three formats. UTF-7 is not included here.
|
|
*
|
|
* Each of these routines takes pointers to input buffers and output
|
|
* buffers. The input buffers are const.
|
|
*
|
|
* Each routine converts the text between *sourceStart and sourceEnd,
|
|
* putting the result into the buffer between *targetStart and
|
|
* targetEnd. Note: the end pointers are *after* the last item: e.g.
|
|
* *(sourceEnd - 1) is the last item.
|
|
*
|
|
* The return result indicates whether the conversion was successful,
|
|
* and if not, whether the problem was in the source or target buffers.
|
|
* (Only the first encountered problem is indicated.)
|
|
*
|
|
* After the conversion, *sourceStart and *targetStart are both
|
|
* updated to point to the end of last text successfully converted in
|
|
* the respective buffers.
|
|
*
|
|
* Input parameters:
|
|
* sourceStart - pointer to a pointer to the source buffer.
|
|
* The contents of this are modified on return so that
|
|
* it points at the next thing to be converted.
|
|
* targetStart - similarly, pointer to pointer to the target buffer.
|
|
* sourceEnd, targetEnd - respectively pointers to the ends of the
|
|
* two buffers, for overflow checking only.
|
|
*
|
|
* These conversion functions take a pdc_convers_flags argument. When this
|
|
* flag is set to strict, both irregular sequences and isolated surrogates
|
|
* will cause an error. When the flag is set to lenient, both irregular
|
|
* sequences and isolated surrogates are converted.
|
|
*
|
|
* Whether the flag is strict or lenient, all illegal sequences will cause
|
|
* an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
|
|
* or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
|
|
* must check for illegal sequences.
|
|
*
|
|
* When the flag is set to lenient, characters over 0x10FFFF are converted
|
|
* to the replacement character; otherwise (when the flag is set to strict)
|
|
* they constitute an error.
|
|
*
|
|
* Output parameters:
|
|
* The value "sourceIllegal" is returned from some routines if the input
|
|
* sequence is malformed. When "sourceIllegal" is returned, the source
|
|
* value will point to the illegal value that caused the problem. E.g.,
|
|
* in UTF-8 when a sequence is malformed, it points to the start of the
|
|
* malformed sequence.
|
|
*
|
|
* Author: Mark E. Davis, 1994.
|
|
* Rev History: Rick McGowan, fixes & updates May 2001.
|
|
*
|
|
*/
|
|
|
|
/*
|
|
* The following 3 type definitions are compiler-specific.
|
|
* The C standard does not guarantee that wchar_t has at least
|
|
* 16 bits, so wchar_t is no less portable than unsigned short!
|
|
* All should be unsigned values to avoid sign extension during
|
|
* bit mask & shift operations.
|
|
*/
|
|
|
|
typedef unsigned long   UTF32;  /* at least 32 bits */
typedef unsigned short  UTF16;  /* at least 16 bits */
typedef unsigned char   UTF8;   /* typically 8 bits */

/* Some fundamental constants.
 * Each expansion is enclosed in parentheses so that the macros behave
 * like a single operand in every context (e.g. as operand of sizeof).
 */
#define UNI_SUR_HIGH_START      ((UTF32)0xD800)     /* first high surrogate */
#define UNI_SUR_HIGH_END        ((UTF32)0xDBFF)     /* last high surrogate */
#define UNI_SUR_LOW_START       ((UTF32)0xDC00)     /* first low surrogate */
#define UNI_SUR_LOW_END         ((UTF32)0xDFFF)     /* last low surrogate */
#define UNI_REPLACEMENT_CHAR    ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP             ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16           ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32           ((UTF32)0x7FFFFFFF)

static const int halfShift      = 10; /* used for shifting by 10 bits */

/* value subtracted from resp. added to a code point beyond the BMP
 * when it is split into resp. built from a surrogate pair */
static const UTF32 halfBase     = 0x0010000UL;

/* mask for the lower 10 bits which are stored in a low surrogate */
static const UTF32 halfMask     = 0x3FFUL;
|
|
|
|
|
|
/* --------------------------------------------------------------------- */
|
|
|
|
#if 0
|
|
static pdc_convers_result
|
|
pdc_convertUTF32toUTF16 (
|
|
UTF32** sourceStart, const UTF32* sourceEnd,
|
|
UTF16** targetStart, const UTF16* targetEnd,
|
|
const pdc_convers_flags flags) {
|
|
pdc_convers_result result = conversionOK;
|
|
UTF32* source = *sourceStart;
|
|
UTF16* target = *targetStart;
|
|
while (source < sourceEnd) {
|
|
UTF32 ch;
|
|
if (target >= targetEnd) {
|
|
result = targetExhausted; break;
|
|
}
|
|
ch = *source++;
|
|
if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
|
|
if ((flags == strictConversion) &&
|
|
(ch >= UNI_SUR_HIGH_START &&
|
|
ch <= UNI_SUR_LOW_END)) {
|
|
--source; /* return to the illegal value itself */
|
|
result = sourceIllegal;
|
|
break;
|
|
} else {
|
|
*target++ = (UTF16) ch; /* normal case */
|
|
}
|
|
} else if (ch > UNI_MAX_UTF16) {
|
|
if (flags == strictConversion) {
|
|
result = sourceIllegal;
|
|
} else {
|
|
*target++ = UNI_REPLACEMENT_CHAR;
|
|
}
|
|
} else {
|
|
/* target is a character in range 0xFFFF - 0x10FFFF. */
|
|
if (target + 1 >= targetEnd) {
|
|
result = targetExhausted;
|
|
break;
|
|
}
|
|
ch -= halfBase;
|
|
*target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
|
|
*target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
|
|
}
|
|
}
|
|
*sourceStart = source;
|
|
*targetStart = target;
|
|
return result;
|
|
}
|
|
|
|
/* --------------------------------------------------------------------- */
|
|
|
|
static pdc_convers_result
|
|
pdc_convertUTF16toUTF32 (
|
|
UTF16** sourceStart, UTF16* sourceEnd,
|
|
UTF32** targetStart, const UTF32* targetEnd,
|
|
const pdc_convers_flags flags) {
|
|
pdc_convers_result result = conversionOK;
|
|
UTF16* source = *sourceStart;
|
|
UTF32* target = *targetStart;
|
|
UTF32 ch, ch2;
|
|
while (source < sourceEnd) {
|
|
ch = *source++;
|
|
if (ch >= UNI_SUR_HIGH_START &&
|
|
ch <= UNI_SUR_HIGH_END &&
|
|
source < sourceEnd) {
|
|
ch2 = *source;
|
|
if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
|
|
ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
|
|
+ (ch2 - UNI_SUR_LOW_START) + halfBase;
|
|
++source;
|
|
} else if (flags == strictConversion) {
|
|
/* it's an unpaired high surrogate */
|
|
--source; /* return to the illegal value itself */
|
|
result = sourceIllegal;
|
|
break;
|
|
}
|
|
} else if ((flags == strictConversion) &&
|
|
(ch >= UNI_SUR_LOW_START &&
|
|
ch <= UNI_SUR_LOW_END)) {
|
|
/* an unpaired low surrogate */
|
|
--source; /* return to the illegal value itself */
|
|
result = sourceIllegal;
|
|
break;
|
|
}
|
|
if (target >= targetEnd) {
|
|
result = targetExhausted;
|
|
break;
|
|
}
|
|
*target++ = ch;
|
|
}
|
|
*sourceStart = source;
|
|
*targetStart = target;
|
|
#ifdef CVTUTF_DEBUG
|
|
if (result == sourceIllegal) {
|
|
fprintf(stderr, "pdc_convertUTF16toUTF32 illegal seq 0x%04x,%04x\n",
|
|
ch, ch2);
|
|
fflush(stderr);
|
|
}
|
|
#endif
|
|
return result;
|
|
}
|
|
#endif
|
|
|
|
/* --------------------------------------------------------------------- */
|
|
|
|
/*
 * Number of trailing bytes which are supposed to follow the first byte
 * of a UTF-8 sequence. The table is indexed by this first byte.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0xBF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xF7, 0xF8 - 0xFB, 0xFC - 0xFF */
    3,3,3,3,3,3,3,3, 4,4,4,4, 5,5,5,5
};

#if 0
/* Accessor for a single entry of trailingBytesForUTF8 (disabled). */
static const char
pdc_get_trailingBytesForUTF8(int i)
{
    return trailingBytesForUTF8[i];
}
#endif
|
|
|
|
/*
|
|
* Magic values subtracted from a buffer value during UTF8 conversion.
|
|
* This table contains as many values as there might be trailing bytes
|
|
* in a UTF-8 sequence.
|
|
*/
|
|
static const UTF32 offsetsFromUTF8[6] = {
|
|
0x00000000UL, 0x00003080UL, 0x000E2080UL,
|
|
0x03C82080UL, 0xFA082080UL, 0x82082080UL
|
|
};
|
|
|
|
/*
|
|
* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
|
|
* into the first byte, depending on how many bytes follow. There are
|
|
* as many entries in this table as there are UTF-8 sequence types.
|
|
* (I.e., one byte sequence, two byte... six byte sequence.)
|
|
*/
|
|
static const UTF8 firstByteMark[7] = {
|
|
0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
|
|
};
|
|
|
|
/* --------------------------------------------------------------------- */
|
|
|
|
/* The interface converts a whole buffer to avoid function-call overhead.
|
|
* Constants have been gathered. Loops & conditionals have been removed as
|
|
* much as possible for efficiency, in favor of drop-through switches.
|
|
* (See "Note A" at the bottom of the file for equivalent code.)
|
|
* If your compiler supports it, the "pdc_islegalUTF8" call can be turned
|
|
* into an inline function.
|
|
*/
|
|
|
|
/* --------------------------------------------------------------------- */
|
|
|
|
static pdc_convers_result
|
|
pdc_convertUTF16toUTF8 (
|
|
UTF16** sourceStart, const UTF16* sourceEnd,
|
|
UTF8** targetStart, const UTF8* targetEnd,
|
|
const pdc_convers_flags flags) {
|
|
pdc_convers_result result = conversionOK;
|
|
UTF16* source = *sourceStart;
|
|
UTF8* target = *targetStart;
|
|
while (source < sourceEnd) {
|
|
UTF32 ch;
|
|
unsigned short bytesToWrite = 0;
|
|
const UTF32 byteMask = 0xBF;
|
|
const UTF32 byteMark = 0x80;
|
|
ch = *source++;
|
|
/* If we have a surrogate pair, convert to UTF32 first. */
|
|
if (ch >= UNI_SUR_HIGH_START &&
|
|
ch <= UNI_SUR_HIGH_END &&
|
|
source < sourceEnd) {
|
|
UTF32 ch2 = *source;
|
|
if (ch2 >= UNI_SUR_LOW_START &&
|
|
ch2 <= UNI_SUR_LOW_END) {
|
|
ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
|
|
+ (ch2 - UNI_SUR_LOW_START) + halfBase;
|
|
++source;
|
|
} else if (flags == strictConversion) {
|
|
/* it's an unpaired high surrogate */
|
|
--source; /* return to the illegal value itself */
|
|
result = sourceIllegal;
|
|
break;
|
|
}
|
|
} else if ((flags == strictConversion) &&
|
|
(ch >= UNI_SUR_LOW_START &&
|
|
ch <= UNI_SUR_LOW_END)) {
|
|
--source; /* return to the illegal value itself */
|
|
result = sourceIllegal;
|
|
break;
|
|
}
|
|
/* Figure out how many bytes the result will require */
|
|
if (ch < (UTF32)0x80) { bytesToWrite = 1;
|
|
} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
|
|
} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
|
|
} else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
|
|
} else { bytesToWrite = 2;
|
|
ch = UNI_REPLACEMENT_CHAR;
|
|
}
|
|
|
|
target += bytesToWrite;
|
|
if (target > targetEnd) {
|
|
target -= bytesToWrite; result = targetExhausted; break;
|
|
}
|
|
switch (bytesToWrite) { /* note: everything falls through. */
|
|
case 4: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
|
|
case 3: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
|
|
case 2: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
|
|
case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
|
|
}
|
|
target += bytesToWrite;
|
|
}
|
|
*sourceStart = source;
|
|
*targetStart = target;
|
|
return result;
|
|
}
|
|
|
|
/* --------------------------------------------------------------------- */
|
|
|
|
/*
|
|
* Utility routine to tell whether a sequence of bytes is legal UTF-8.
|
|
* This must be called with the length pre-determined by the first byte.
|
|
* If not calling this from pdc_convertUTF8to*, then the length can be set by:
|
|
* length = trailingBytesForUTF8[*source]+1;
|
|
* and the sequence is illegal right away if there aren't that many bytes
|
|
* available.
|
|
* If presented with a length > 4, this returns pdc_false. The Unicode
|
|
* definition of UTF-8 goes up to 4-byte sequences.
|
|
*/
|
|
|
|
static pdc_bool
|
|
pdc_islegalUTF8(UTF8 *source, int length) {
|
|
UTF8 a;
|
|
UTF8 *srcptr = source+length;
|
|
switch (length) {
|
|
default: return pdc_false;
|
|
/* Everything else falls through when "pdc_true"... */
|
|
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
|
|
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
|
|
case 2: if ((a = (*--srcptr)) > 0xBF) return pdc_false;
|
|
switch (*source) {
|
|
/* no fall-through in this inner switch */
|
|
case 0xE0: if (a < 0xA0) return pdc_false; break;
|
|
case 0xF0: if (a < 0x90) return pdc_false; break;
|
|
case 0xF4: if (a > 0x8F) return pdc_false; break;
|
|
default: if (a < 0x80) return pdc_false;
|
|
}
|
|
case 1: if (*source >= 0x80 && *source < 0xC2) return pdc_false;
|
|
if (*source > 0xF4) return pdc_false;
|
|
}
|
|
return pdc_true;
|
|
}
|
|
|
|
/* --------------------------------------------------------------------- */
|
|
|
|
/*
|
|
* Exported function to return whether a UTF-8 sequence is legal or not.
|
|
* This is not used here; it's just exported.
|
|
*/
|
|
#if 0
|
|
/*
 * Tells whether the UTF-8 sequence starting at source is legal. The
 * length of the sequence is derived from its first byte; pdc_false is
 * returned if the sequence would extend beyond sourceEnd.
 */
static pdc_bool
pdc_islegalUTF8sequence(UTF8 *source, UTF8 *sourceEnd)
{
    int seqlen = trailingBytesForUTF8[*source] + 1;

    return (source + seqlen > sourceEnd) ?
           pdc_false : pdc_islegalUTF8(source, seqlen);
}
|
|
#endif
|
|
|
|
/* --------------------------------------------------------------------- */
|
|
|
|
static pdc_convers_result
|
|
pdc_convertUTF8toUTF16 (
|
|
UTF8** sourceStart, UTF8* sourceEnd,
|
|
UTF16** targetStart, const UTF16* targetEnd,
|
|
const pdc_convers_flags flags) {
|
|
pdc_convers_result result = conversionOK;
|
|
UTF8* source = *sourceStart;
|
|
UTF16* target = *targetStart;
|
|
while (source < sourceEnd) {
|
|
UTF32 ch = 0L;
|
|
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
|
|
if (source + extraBytesToRead >= sourceEnd) {
|
|
result = sourceExhausted;
|
|
break;
|
|
}
|
|
/* Do this check whether lenient or strict */
|
|
if (! pdc_islegalUTF8(source, extraBytesToRead+1)) {
|
|
result = sourceIllegal;
|
|
break;
|
|
}
|
|
/*
|
|
* The cases all fall through. See "Note A" below.
|
|
*/
|
|
switch (extraBytesToRead) {
|
|
case 3: ch += *source++; ch <<= 6;
|
|
case 2: ch += *source++; ch <<= 6;
|
|
case 1: ch += *source++; ch <<= 6;
|
|
case 0: ch += *source++;
|
|
}
|
|
ch -= offsetsFromUTF8[extraBytesToRead];
|
|
|
|
if (target >= targetEnd) {
|
|
result = targetExhausted;
|
|
break;
|
|
}
|
|
if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
|
|
if ((flags == strictConversion) &&
|
|
(ch >= UNI_SUR_HIGH_START &&
|
|
ch <= UNI_SUR_LOW_END)) {
|
|
--source; /* return to the illegal value itself */
|
|
result = sourceIllegal;
|
|
break;
|
|
} else {
|
|
*target++ = (UTF16) ch; /* normal case */
|
|
}
|
|
} else if (ch > UNI_MAX_UTF16) {
|
|
if (flags == strictConversion) {
|
|
result = sourceIllegal;
|
|
source -= extraBytesToRead; /* return to the start */
|
|
} else {
|
|
*target++ = UNI_REPLACEMENT_CHAR;
|
|
}
|
|
} else {
|
|
/* target is a character in range 0xFFFF - 0x10FFFF. */
|
|
if (target + 1 >= targetEnd) {
|
|
result = targetExhausted;
|
|
break;
|
|
}
|
|
ch -= halfBase;
|
|
*target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
|
|
*target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
|
|
}
|
|
}
|
|
*sourceStart = source;
|
|
*targetStart = target;
|
|
return result;
|
|
}
|
|
|
|
/* --------------------------------------------------------------------- */
|
|
|
|
#if 0
|
|
static pdc_convers_result
|
|
pdc_convertUTF32toUTF8 (
|
|
UTF32** sourceStart, const UTF32* sourceEnd,
|
|
UTF8** targetStart, const UTF8* targetEnd,
|
|
const pdc_convers_flags flags) {
|
|
pdc_convers_result result = conversionOK;
|
|
UTF32* source = *sourceStart;
|
|
UTF8* target = *targetStart;
|
|
while (source < sourceEnd) {
|
|
UTF32 ch;
|
|
unsigned short bytesToWrite = 0;
|
|
const UTF32 byteMask = 0x000000BF;
|
|
const UTF32 byteMark = 0x00000080;
|
|
ch = *source++;
|
|
/* surrogates of any stripe are not legal UTF32 characters */
|
|
if (flags == strictConversion ) {
|
|
if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)) {
|
|
--source; /* return to the illegal value itself */
|
|
result = sourceIllegal;
|
|
break;
|
|
}
|
|
}
|
|
/* Figure out how many bytes the result will require */
|
|
if (ch < (UTF32)0x80) { bytesToWrite = 1;
|
|
} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
|
|
} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
|
|
} else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
|
|
} else { bytesToWrite = 2;
|
|
ch = UNI_REPLACEMENT_CHAR;
|
|
}
|
|
|
|
target += bytesToWrite;
|
|
if (target > targetEnd) {
|
|
target -= bytesToWrite; result = targetExhausted; break;
|
|
}
|
|
switch (bytesToWrite) { /* note: everything falls through. */
|
|
case 4: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
|
|
case 3: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
|
|
case 2: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
|
|
case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
|
|
}
|
|
target += bytesToWrite;
|
|
}
|
|
*sourceStart = source;
|
|
*targetStart = target;
|
|
return result;
|
|
}
|
|
|
|
/* --------------------------------------------------------------------- */
|
|
|
|
static pdc_convers_result
|
|
pdc_convertUTF8toUTF32 (
|
|
UTF8** sourceStart, UTF8* sourceEnd,
|
|
UTF32** targetStart, const UTF32* targetEnd,
|
|
const pdc_convers_flags flags) {
|
|
pdc_convers_result result = conversionOK;
|
|
UTF8* source = *sourceStart;
|
|
UTF32* target = *targetStart;
|
|
|
|
(void) flags;
|
|
|
|
while (source < sourceEnd) {
|
|
UTF32 ch = 0;
|
|
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
|
|
if (source + extraBytesToRead >= sourceEnd) {
|
|
result = sourceExhausted; break;
|
|
}
|
|
/* Do this check whether lenient or strict */
|
|
if (! pdc_islegalUTF8(source, extraBytesToRead+1)) {
|
|
result = sourceIllegal;
|
|
break;
|
|
}
|
|
/*
|
|
* The cases all fall through. See "Note A" below.
|
|
*/
|
|
switch (extraBytesToRead) {
|
|
case 3: ch += *source++; ch <<= 6;
|
|
case 2: ch += *source++; ch <<= 6;
|
|
case 1: ch += *source++; ch <<= 6;
|
|
case 0: ch += *source++;
|
|
}
|
|
ch -= offsetsFromUTF8[extraBytesToRead];
|
|
|
|
if (target >= targetEnd) {
|
|
result = targetExhausted;
|
|
break;
|
|
}
|
|
if (ch <= UNI_MAX_UTF32) {
|
|
*target++ = ch;
|
|
} else if (ch > UNI_MAX_UTF32) {
|
|
*target++ = UNI_REPLACEMENT_CHAR;
|
|
} else {
|
|
if (target + 1 >= targetEnd) {
|
|
result = targetExhausted;
|
|
break;
|
|
}
|
|
ch -= halfBase;
|
|
*target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
|
|
*target++ = (ch & halfMask) + UNI_SUR_LOW_START;
|
|
}
|
|
}
|
|
*sourceStart = source;
|
|
*targetStart = target;
|
|
return result;
|
|
}
|
|
#endif
|
|
|
|
/* ---------------------------------------------------------------------
|
|
|
|
Note A.
|
|
The fall-through switches in UTF-8 reading code save a
|
|
temp variable, some decrements & conditionals. The switches
|
|
are equivalent to the following loop:
|
|
{
|
|
int tmpBytesToRead = extraBytesToRead+1;
|
|
do {
|
|
ch += *source++;
|
|
--tmpBytesToRead;
|
|
if (tmpBytesToRead) ch <<= 6;
|
|
} while (tmpBytesToRead > 0);
|
|
}
|
|
In UTF-8 writing code, the switches on "bytesToWrite" are
|
|
similarly unrolled loops.
|
|
|
|
--------------------------------------------------------------------- */
|
|
|
|
/*
|
|
* pdc_convert_string converts an arbitrarily encoded string (maybe UTF) to
|
|
* another encoded string.
|
|
*
|
|
* The new converted string is allocated and terminated by the required
|
|
* number of zeros.
|
|
*
|
|
* The caller is responsible for freeing the resulting string buffer.
|
|
*
|
|
*
|
|
* LBP: low byte picking
|
|
*
|
|
* Input-Parameter:
|
|
*
|
|
* inutf: input string format (see pc_unicode.h):
|
|
*
|
|
* pdc_auto: If codepage != 0:
|
|
* see above.
|
|
* Otherwise:
|
|
* If a BOM is recognized:
|
|
* pdc_utf8 or pdc_utf16xx resp.
|
|
* Otherwise if input encoding <inev> is specified:
|
|
* pdc_bytes
|
|
* Otherwise:
|
|
* pdc_utf16
|
|
*
|
|
* pdc_auto2: If input encoding is not specified:
|
|
* pdc_utf16
|
|
* Otherwise after successful LBP:
|
|
* pdc_auto
|
|
* Otherwise:
|
|
* pdc_utf16
|
|
*
|
|
* pdc_bytes: 8-bit string. Encoding is <inev> if specified.
|
|
*
|
|
* pdc_bytes2: After successful LBP:
|
|
* pdc_bytes
|
|
* Otherwise:
|
|
* pdc_utf16
|
|
*
|
|
* pdc_utf8: UTF-8 formatted string.
|
|
*
|
|
* pdc_ebcdicutf8: EBCDIC-UTF-8 formatted string.
|
|
*
|
|
* pdc_utf16: If a UTF16 BOM is recognized:
|
|
* pdc_utf16be or pdc_utf16le
|
|
* Otherwise UTF-16 machine byte ordered string.
|
|
*
|
|
* pdc_utf16be UTF-16 big endian formatted string.
|
|
*
|
|
* pdc_utf16le UTF-16 little endian formatted string.
|
|
*
|
|
* codepage: OEM multi byte code-page number. If > 0 and
|
|
* inutf = pdc_auto, text will be converted to UTF-16.
|
|
*
|
|
* inev: Encoding vector for input pdc_bytes string.
|
|
*
|
|
* instring: Input string.
|
|
*
|
|
* inlen: Length of input string in byte.
|
|
*
|
|
* oututf: Target format for output string.
|
|
* pdc_auto, pdc_auto2 and pdc_bytes2 are not supported.
|
|
*
|
|
* outev: Encoding vector for output pdc_bytes string.
|
|
*
|
|
* flags: PDC_CONV_KEEPBYTES:
|
|
* Input pdc_bytes strings will be kept differing from oututf.
|
|
* *oututf: pdc_byte.
|
|
*
|
|
* PDC_CONV_TRY7BYTES:
|
|
* UTF-8 output strings will have no BOM if every byte
|
|
* is smaller than x80.
|
|
* *oututf: pdc_byte.
|
|
*
|
|
* PDC_CONV_TRYBYTES:
|
|
* UTF-16xx output strings will be converted by LBP
|
|
* if each character is smaller than x0100.
|
|
* *oututf: pdc_byte.
|
|
*
|
|
* PDC_CONV_WITHBOM:
|
|
* UTF-8 or UTF-16xx output strings will be armed
|
|
* with an appropriate BOM.
|
|
*
|
|
* PDC_CONV_NOBOM:
|
|
* In UTF-8 or UTF-16xx output strings any BOM sequence
|
|
* will be removed.
|
|
*
|
|
* PDC_CONV_ANALYZE:
|
|
* Only analyzing BOMs of input string and dissolving auto
|
|
* textformats.
|
|
*
|
|
* PDC_CONV_TMPALLOC
|
|
* Temporary memory functions (pdc_malloc_tmp) are used
|
|
* rather than pdc_malloc etc.
|
|
*
|
|
* PDC_CONV_HTMLCHAR
|
|
* If input encoding vector is specified HTML character
|
|
* entities will be substituted.
|
|
*
|
|
* PDC_CONV_NEWALLOC
|
|
* Input string must be allocated at first to guarantee
|
|
* pointer alignment.
|
|
*
|
|
*
|
|
* verbose: Error messages are put out. Otherwise they are saved only.
|
|
*
|
|
* Output-Parameter:
|
|
*
|
|
* oututf: Reached format for output string.
|
|
*
|
|
* outstring: Pointer of allocated output string
|
|
*
|
|
* outlen: Length of output string.
|
|
*
|
|
*/
|
|
|
|
#if defined(_MSC_VER) && defined(_MANAGED)
|
|
#pragma unmanaged
|
|
#endif
|
|
int
|
|
pdc_convert_string(pdc_core *pdc,
|
|
pdc_text_format inutf, int codepage,
|
|
pdc_encodingvector *inev,
|
|
pdc_byte *instring, int inlen,
|
|
pdc_text_format *oututf_p, pdc_encodingvector *outev,
|
|
pdc_byte **outstring, int *outlen, int flags,
|
|
pdc_bool verbose)
|
|
{
|
|
static const char *fn = "pdc_convert_string";
|
|
const char *stemp = NULL;
|
|
pdc_text_format oututf = *oututf_p;
|
|
pdc_text_format oututf_s;
|
|
pdc_ushort *usinstr = (pdc_ushort *) instring;
|
|
pdc_ushort uv = 0, uvr = 0;
|
|
pdc_byte *instr = (pdc_byte *) instring;
|
|
pdc_bool inalloc = pdc_false;
|
|
pdc_bool hasbom = pdc_false;
|
|
pdc_bool toswap = pdc_false;
|
|
int errcode = 0;
|
|
int i, j, n, len = 0;
|
|
|
|
/* prophylactic */
|
|
if (!inlen)
|
|
{
|
|
instring = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_calloc_tmp(pdc, 2, fn, NULL, NULL) :
|
|
pdc_calloc(pdc, 2, fn));
|
|
|
|
inalloc = pdc_true;
|
|
}
|
|
else if (flags & PDC_CONV_NEWALLOC)
|
|
{
|
|
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_malloc_tmp(pdc, (size_t) inlen, fn, NULL, NULL) :
|
|
pdc_malloc(pdc, (size_t) inlen, fn));
|
|
memcpy(instr, instring, (size_t) inlen);
|
|
|
|
inalloc = pdc_true;
|
|
instring = instr;
|
|
usinstr = (pdc_ushort *) instring;
|
|
}
|
|
|
|
|
|
switch(inutf)
|
|
{
|
|
/* analyzing 2 byte textformat */
|
|
case pdc_auto2:
|
|
case pdc_bytes2:
|
|
if ((inutf == pdc_auto2 && !inev) || (flags & PDC_CONV_ANALYZE))
|
|
{
|
|
inutf = pdc_utf16;
|
|
}
|
|
else
|
|
{
|
|
len = inlen / 2;
|
|
if (2 * len != inlen)
|
|
{
|
|
errcode = PDC_E_CONV_ILLUTF16;
|
|
goto PDC_CONV_ERROR;
|
|
}
|
|
for (i = 0; i < len; i++)
|
|
if (usinstr[i] > 0x00FF)
|
|
break;
|
|
|
|
/* low byte picking */
|
|
if (i == len)
|
|
{
|
|
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
|
|
pdc_calloc(pdc, (size_t) (len + 2), fn));
|
|
for (i = 0; i < len; i++)
|
|
instr[i] = (pdc_byte) usinstr[i];
|
|
|
|
if (inalloc)
|
|
{
|
|
if (flags & PDC_CONV_TMPALLOC)
|
|
pdc_free_tmp(pdc, instring);
|
|
else
|
|
pdc_free(pdc, instring);
|
|
}
|
|
|
|
inalloc = pdc_true;
|
|
instring = instr;
|
|
inlen = len;
|
|
|
|
if (inutf == pdc_bytes2)
|
|
inutf = pdc_bytes;
|
|
else
|
|
inutf = pdc_auto;
|
|
}
|
|
else
|
|
{
|
|
inutf = pdc_utf16;
|
|
}
|
|
}
|
|
break;
|
|
|
|
/* OEM multi byte text strings */
|
|
case pdc_auto:
|
|
if (codepage > 0)
|
|
{
|
|
#if defined(WIN32)
|
|
if (!(flags & PDC_CONV_ANALYZE))
|
|
{
|
|
len = MultiByteToWideChar((UINT) codepage, (DWORD) 0,
|
|
(LPCSTR) instring, inlen, NULL, 0);
|
|
if (len == 0)
|
|
{
|
|
DWORD lasterror = GetLastError();
|
|
|
|
if (lasterror == ERROR_INVALID_PARAMETER)
|
|
{
|
|
errcode = PDC_E_CONV_UNSUPP_MBTEXTFORM;
|
|
}
|
|
else
|
|
{
|
|
stemp = pdc_errprintf(pdc, "cp%d", codepage);
|
|
errcode = PDC_E_CONV_ILL_MBTEXTSTRING;
|
|
}
|
|
goto PDC_CONV_ERROR;
|
|
}
|
|
|
|
len *= 2;
|
|
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn,
|
|
NULL, NULL) :
|
|
pdc_calloc(pdc, (size_t) (len + 2), fn));
|
|
MultiByteToWideChar((UINT) codepage, (DWORD) 0, (LPCSTR)
|
|
instring, inlen,
|
|
(LPWSTR) instr, len);
|
|
|
|
if (inalloc)
|
|
{
|
|
if (flags & PDC_CONV_TMPALLOC)
|
|
pdc_free_tmp(pdc, instring);
|
|
else
|
|
pdc_free(pdc, instring);
|
|
}
|
|
|
|
inalloc = pdc_true;
|
|
instring = instr;
|
|
inlen = len;
|
|
}
|
|
inutf = pdc_utf16;
|
|
#else /* WIN32 */
|
|
errcode = PDC_E_CONV_UNSUPP_MBTEXTFORM;
|
|
goto PDC_CONV_ERROR;
|
|
#endif /* !WIN32 */
|
|
}
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
/* analyzing UTF-16 textformat */
|
|
if (inutf == pdc_utf16)
|
|
{
|
|
if (pdc_is_utf16be_unicode(instring))
|
|
inutf = pdc_utf16be;
|
|
else if (pdc_is_utf16le_unicode(instring))
|
|
inutf = pdc_utf16le;
|
|
}
|
|
|
|
/* analyzing auto textformat */
|
|
else if (inutf == pdc_auto)
|
|
{
|
|
if (pdc_is_utf8_bytecode(instring))
|
|
inutf = PDC_UTF8;
|
|
else if (pdc_is_utf16be_unicode(instring))
|
|
inutf = pdc_utf16be;
|
|
else if (pdc_is_utf16le_unicode(instring))
|
|
inutf = pdc_utf16le;
|
|
else if (inev)
|
|
inutf = pdc_bytes;
|
|
else
|
|
inutf = pdc_utf16;
|
|
}
|
|
|
|
/* only analyzing */
|
|
if (flags & PDC_CONV_ANALYZE)
|
|
goto PDC_CONV_EXIT;
|
|
|
|
/* conversion to UTF-16 by swapping */
|
|
if ((inutf == pdc_utf16be || inutf == pdc_utf16le) &&
|
|
(inutf != oututf || flags & PDC_CONV_TRYBYTES ||
|
|
flags & PDC_CONV_HTMLCHAR))
|
|
{
|
|
if (inlen &&
|
|
((inutf == pdc_utf16be && !PDC_ISBIGENDIAN) ||
|
|
(inutf == pdc_utf16le && PDC_ISBIGENDIAN)))
|
|
{
|
|
if (inalloc)
|
|
pdc_swap_bytes((char *) instring, inlen, NULL);
|
|
else
|
|
{
|
|
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) :
|
|
pdc_calloc(pdc, (size_t) (inlen + 2), fn));
|
|
pdc_swap_bytes((char *) instring, inlen, (char *) instr);
|
|
|
|
inalloc = pdc_true;
|
|
instring = instr;
|
|
}
|
|
}
|
|
inutf = pdc_utf16;
|
|
}
|
|
|
|
/* illegal UTF-16 */
|
|
if (inutf >= pdc_utf16 && inlen % 2)
|
|
{
|
|
errcode = PDC_E_CONV_ILLUTF16;
|
|
goto PDC_CONV_ERROR;
|
|
}
|
|
|
|
|
|
/* conversion to UTF-16 by inflation or encoding vector */
|
|
if (inutf == pdc_bytes)
|
|
{
|
|
if ((oututf != pdc_bytes && !(flags & PDC_CONV_KEEPBYTES)) ||
|
|
flags & PDC_CONV_HTMLCHAR || inev != NULL || outev != NULL)
|
|
{
|
|
len = 2 * inlen;
|
|
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
|
|
pdc_calloc(pdc, (size_t) (len + 2), fn));
|
|
usinstr = (pdc_ushort *) instr;
|
|
|
|
j = 0;
|
|
for (i = 0; i < inlen; i++)
|
|
{
|
|
uv = (pdc_ushort) instring[i];
|
|
if (inev)
|
|
uv = inev->codes[uv];
|
|
if (!uv)
|
|
uv = PDC_UNICODE_SPACE;
|
|
usinstr[j] = uv;
|
|
j++;
|
|
}
|
|
|
|
if (inalloc)
|
|
{
|
|
if (flags & PDC_CONV_TMPALLOC)
|
|
pdc_free_tmp(pdc, instring);
|
|
else
|
|
pdc_free(pdc, instring);
|
|
}
|
|
|
|
inalloc = pdc_true;
|
|
instring = instr;
|
|
inlen = 2 * j;
|
|
inutf = pdc_utf16;
|
|
}
|
|
else if (flags & PDC_CONV_KEEPBYTES)
|
|
{
|
|
oututf = pdc_bytes;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/* UTF conversion */
|
|
oututf_s = oututf;
|
|
if ((oututf_s == pdc_bytes && inutf == pdc_utf8) ||
|
|
oututf_s == pdc_utf16be || oututf_s == pdc_utf16le)
|
|
oututf_s = pdc_utf16;
|
|
if (inutf != oututf_s && oututf_s != pdc_bytes)
|
|
{
|
|
len = 4 * inlen + 2;
|
|
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_calloc_tmp(pdc, (size_t) len, fn, NULL, NULL) :
|
|
pdc_calloc(pdc, (size_t) len, fn));
|
|
|
|
if (inlen)
|
|
{
|
|
pdc_convers_result result;
|
|
pdc_byte *instringa, *instra, *instringe, *instre;
|
|
UTF8 *isa8, *ise8;
|
|
UTF16 *isa16, *ise16;
|
|
|
|
instringa = instring;
|
|
instringe = instring + inlen;
|
|
instra = instr;
|
|
instre = instr + len;
|
|
|
|
if (inutf == pdc_utf8)
|
|
{
|
|
isa8 = (UTF8 *) instringa;
|
|
ise8 = (UTF8 *) instringe;
|
|
isa16 = (UTF16 *) instra;
|
|
ise16 = (UTF16 *) instre;
|
|
result = pdc_convertUTF8toUTF16(&isa8, ise8, &isa16, ise16,
|
|
strictConversion);
|
|
instra = (pdc_byte *) isa16;
|
|
instre = (pdc_byte *) ise16;
|
|
}
|
|
else
|
|
{
|
|
isa16 = (UTF16 *) instringa;
|
|
ise16 = (UTF16 *) instringe;
|
|
isa8 = (UTF8 *) instra;
|
|
ise8 = (UTF8 *) instre;
|
|
result = pdc_convertUTF16toUTF8(&isa16, ise16, &isa8, ise8,
|
|
strictConversion);
|
|
instra = (pdc_byte *) isa8;
|
|
instre = (pdc_byte *) ise8;
|
|
}
|
|
switch (result)
|
|
{
|
|
case targetExhausted:
|
|
errcode = PDC_E_CONV_MEMOVERFLOW;
|
|
break;
|
|
|
|
case sourceExhausted:
|
|
case sourceIllegal:
|
|
errcode = PDC_E_CONV_ILLUTF;
|
|
stemp = pdc_errprintf(pdc, "%d", inutf == pdc_utf8 ? 8 : 16);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (errcode)
|
|
{
|
|
instring = instr;
|
|
goto PDC_CONV_ERROR;
|
|
}
|
|
|
|
inlen = instra - instr;
|
|
}
|
|
|
|
if (inalloc)
|
|
{
|
|
if (flags & PDC_CONV_TMPALLOC)
|
|
pdc_free_tmp(pdc, instring);
|
|
else
|
|
pdc_free(pdc, instring);
|
|
}
|
|
|
|
if (inlen + 2 != len)
|
|
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_realloc_tmp(pdc, instr, (size_t) (inlen + 2), fn) :
|
|
pdc_realloc(pdc, instr, (size_t) (inlen + 2), fn));
|
|
instr[inlen] = 0;
|
|
instr[inlen + 1] = 0;
|
|
|
|
inalloc = pdc_true;
|
|
instring = instr;
|
|
inutf = oututf_s;
|
|
}
|
|
|
|
if (inutf == pdc_bytes)
|
|
{
|
|
if (!inalloc)
|
|
{
|
|
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) :
|
|
pdc_calloc(pdc, (size_t) (inlen + 2), fn));
|
|
memcpy(instr, instring, (size_t) inlen);
|
|
|
|
inalloc = pdc_true;
|
|
instring = instr;
|
|
}
|
|
}
|
|
|
|
/* trying to reduce UTF-16 string to bytes string */
|
|
if (inutf == pdc_utf16 && (oututf == pdc_bytes ||
|
|
flags & PDC_CONV_TRYBYTES || flags & PDC_CONV_KEEPBYTES))
|
|
{
|
|
if (pdc_is_utf16be_unicode(instring) ||
|
|
pdc_is_utf16le_unicode(instring))
|
|
n = 1;
|
|
else
|
|
n = 0;
|
|
|
|
len = (inlen - n) / 2;
|
|
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
|
|
pdc_calloc(pdc, (size_t) (len + 2), fn));
|
|
usinstr = (pdc_ushort *) instring;
|
|
|
|
for (i = 0; i < len; i++)
|
|
{
|
|
uv = usinstr[i + n];
|
|
if (outev && uv)
|
|
uv = (pdc_ushort) pdc_get_encoding_bytecode(pdc, outev, uv);
|
|
if (uv > 0x00FF)
|
|
break;
|
|
|
|
instr[i] = (pdc_byte) uv;
|
|
}
|
|
|
|
if (i == len)
|
|
{
|
|
if (inalloc)
|
|
{
|
|
if (flags & PDC_CONV_TMPALLOC)
|
|
pdc_free_tmp(pdc, instring);
|
|
else
|
|
pdc_free(pdc, instring);
|
|
}
|
|
|
|
inalloc = pdc_true;
|
|
instring = instr;
|
|
inlen = len;
|
|
inutf = pdc_bytes;
|
|
}
|
|
else
|
|
{
|
|
if (flags & PDC_CONV_TMPALLOC)
|
|
pdc_free_tmp(pdc, instr);
|
|
else
|
|
pdc_free(pdc, instr);
|
|
}
|
|
}
|
|
|
|
/* UTF-8 format */
|
|
if (inutf == pdc_utf8)
|
|
{
|
|
hasbom = pdc_is_utf8_unicode(instring);
|
|
|
|
if (flags & PDC_CONV_TRY7BYTES)
|
|
{
|
|
for (i = hasbom ? 3 : 0; i < inlen; i++)
|
|
if (instring[i] > 0x7F)
|
|
break;
|
|
if (i == inlen)
|
|
{
|
|
flags &= ~PDC_CONV_WITHBOM;
|
|
flags |= PDC_CONV_NOBOM;
|
|
inutf = pdc_bytes;
|
|
}
|
|
}
|
|
|
|
if (!inalloc || flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM)
|
|
{
|
|
i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 3 : 0;
|
|
j = (flags & PDC_CONV_NOBOM && hasbom) ? 3 : 0;
|
|
|
|
len = inlen + i - j;
|
|
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
|
|
pdc_calloc(pdc, (size_t) (len + 2), fn));
|
|
memcpy(&instr[i], &instring[j], (size_t) (inlen - j));
|
|
instr[len] = 0;
|
|
|
|
if (inalloc)
|
|
{
|
|
if (flags & PDC_CONV_TMPALLOC)
|
|
pdc_free_tmp(pdc, instring);
|
|
else
|
|
pdc_free(pdc, instring);
|
|
}
|
|
|
|
instring = instr;
|
|
inlen = len;
|
|
|
|
hasbom = (flags & PDC_CONV_WITHBOM);
|
|
}
|
|
|
|
if (hasbom)
|
|
{
|
|
instring[0] = PDF_BOM2;
|
|
instring[1] = PDF_BOM3;
|
|
instring[2] = PDF_BOM4;
|
|
}
|
|
|
|
}
|
|
|
|
/* UTF-16 formats */
|
|
if (inutf == pdc_utf16 || inutf == pdc_utf16be || inutf == pdc_utf16le)
|
|
{
|
|
hasbom = pdc_is_utf16be_unicode(instring) ||
|
|
pdc_is_utf16le_unicode(instring);
|
|
|
|
if (!inalloc || oututf == pdc_utf16be || oututf == pdc_utf16le ||
|
|
flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM)
|
|
{
|
|
i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 2 : 0;
|
|
j = (flags & PDC_CONV_NOBOM && hasbom) ? 2 : 0;
|
|
|
|
len = inlen + i - j;
|
|
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
|
|
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
|
|
pdc_calloc(pdc, (size_t) (len + 2), fn));
|
|
memcpy(&instr[i], &instring[j], (size_t) (inlen - j));
|
|
|
|
if (inalloc)
|
|
{
|
|
if (flags & PDC_CONV_TMPALLOC)
|
|
pdc_free_tmp(pdc, instring);
|
|
else
|
|
pdc_free(pdc, instring);
|
|
}
|
|
|
|
instring = instr;
|
|
inlen = len;
|
|
|
|
hasbom = (flags & PDC_CONV_WITHBOM);
|
|
}
|
|
|
|
i = hasbom ? 2 : 0;
|
|
if (inutf == pdc_utf16)
|
|
{
|
|
if (oututf == pdc_utf16be)
|
|
{
|
|
inutf = pdc_utf16be;
|
|
toswap = !PDC_ISBIGENDIAN;
|
|
}
|
|
if (oututf == pdc_utf16le)
|
|
{
|
|
inutf = pdc_utf16le;
|
|
toswap = PDC_ISBIGENDIAN;
|
|
}
|
|
if (toswap)
|
|
pdc_swap_bytes((char *) &instring[i], inlen - i, NULL);
|
|
}
|
|
|
|
if (hasbom)
|
|
{
|
|
if (inutf == pdc_utf16be ||
|
|
(inutf == pdc_utf16 && PDC_ISBIGENDIAN))
|
|
{
|
|
instring[0] = PDF_BOM0;
|
|
instring[1] = PDF_BOM1;
|
|
}
|
|
if (inutf == pdc_utf16le ||
|
|
(inutf == pdc_utf16 && !PDC_ISBIGENDIAN))
|
|
{
|
|
instring[0] = PDF_BOM1;
|
|
instring[1] = PDF_BOM0;
|
|
}
|
|
}
|
|
}
|
|
|
|
PDC_CONV_EXIT:
|
|
*oututf_p = inutf;
|
|
if (outlen)
|
|
*outlen = inlen;
|
|
*outstring = instring;
|
|
return 0;
|
|
|
|
PDC_CONV_ERROR:
|
|
if (outlen)
|
|
*outlen = 0;
|
|
*outstring = NULL;
|
|
|
|
pdc_set_errmsg(pdc, errcode, stemp, 0, 0, 0);
|
|
|
|
if (inalloc)
|
|
{
|
|
if (flags & PDC_CONV_TMPALLOC)
|
|
pdc_free_tmp(pdc, instring);
|
|
else
|
|
pdc_free(pdc, instring);
|
|
}
|
|
|
|
if (verbose)
|
|
pdc_error(pdc, -1, 0, 0, 0, 0);
|
|
|
|
return errcode;
|
|
}
|
|
#if defined(_MSC_VER) && defined(_MANAGED)
|
|
#pragma managed
|
|
#endif
|
|
|
|
|
|
|
|
|