campo-sirio/pdf/pdcore/pc_unicode.c
guy 3bf951f42c Patch level : 10.0
Files correlati     : pdflib
Ricompilazione Demo : [ ]
Commento            :
Aggiornata pdflib.dll alla versione 7.0.4


git-svn-id: svn://10.65.10.50/trunk@18580 c028cbd2-c16b-5b4b-a496-9718f37d4682
2009-03-23 08:55:58 +00:00

2007 lines
64 KiB
C
Executable File

/*---------------------------------------------------------------------------*
| PDFlib - A library for generating PDF on the fly |
+---------------------------------------------------------------------------+
| Copyright (c) 1997-2006 Thomas Merz and PDFlib GmbH. All rights reserved. |
+---------------------------------------------------------------------------+
| |
| This software is subject to the PDFlib license. It is NOT in the |
| public domain. Extended versions and commercial licenses are |
| available, please check http://www.pdflib.com. |
| |
*---------------------------------------------------------------------------*/
/* $Id: pc_unicode.c,v 1.4 2009-03-23 08:51:17 guy Exp $
*
* PDFlib Unicode converting routines
*
*/
#define PC_UNICODE_C
#include "pc_util.h"
#if defined(WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif /* WIN32 */
/*
* The following source is based on Unicode's original source
* code ConvertUTF.c. It has been adapted to PDFlib programming
* conventions.
*
* The original file had the following notice:
*
* Copyright 2001 Unicode, Inc.
*
* Limitations on Rights to Redistribute This Code
*
* Author: Mark E. Davis, 1994.
* Rev History: Rick McGowan, fixes & updates May 2001.
*
*
* Functions for conversions between UTF32, UTF-16, and UTF-8.
* These functions form a complete set of conversions between
* the three formats. UTF-7 is not included here.
*
* Each of these routines takes pointers to input buffers and output
* buffers. The input buffers are const.
*
* Each routine converts the text between *sourceStart and sourceEnd,
* putting the result into the buffer between *targetStart and
* targetEnd. Note: the end pointers are *after* the last item: e.g.
* *(sourceEnd - 1) is the last item.
*
* The return result indicates whether the conversion was successful,
* and if not, whether the problem was in the source or target buffers.
* (Only the first encountered problem is indicated.)
*
* After the conversion, *sourceStart and *targetStart are both
* updated to point to the end of last text successfully converted in
* the respective buffers.
*
* Input parameters:
* sourceStart - pointer to a pointer to the source buffer.
* The contents of this are modified on return so that
* it points at the next thing to be converted.
* targetStart - similarly, pointer to pointer to the target buffer.
* sourceEnd, targetEnd - respectively pointers to the ends of the
* two buffers, for overflow checking only.
*
* These conversion functions take a pdc_convers_flags argument. When this
* flag is set to strict, both irregular sequences and isolated surrogates
* will cause an error. When the flag is set to lenient, both irregular
* sequences and isolated surrogates are converted.
*
* Whether the flag is strict or lenient, all illegal sequences will cause
* an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
* or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
* must check for illegal sequences.
*
* When the flag is set to lenient, characters over 0x10FFFF are converted
* to the replacement character; otherwise (when the flag is set to strict)
* they constitute an error.
*
* Output parameters:
* The value "sourceIllegal" is returned from some routines if the input
* sequence is malformed. When "sourceIllegal" is returned, the source
* value will point to the illegal value that caused the problem. E.g.,
* in UTF-8 when a sequence is malformed, it points to the start of the
* malformed sequence.
*
* Author: Mark E. Davis, 1994.
* Rev History: Rick McGowan, fixes & updates May 2001.
*
*/
/*
* The following 4 definitions are compiler-specific.
* The C standard does not guarantee that wchar_t has at least
* 16 bits, so wchar_t is no less portable than unsigned short!
* All should be unsigned values to avoid sign extension during
* bit mask & shift operations.
*/
/* Unicode original:
typedef unsigned long UTF32; at least 32 bits
typedef unsigned short UTF16; at least 16 bits
*/
/* Code unit types used by the conversion routines in this file. */
typedef unsigned int UTF32; /* 32 bits */
typedef unsigned short UTF16; /* 16 bits */
typedef unsigned char UTF8; /* typically 8 bits */
/* Some fundamental constants */
/* Bounds of the UTF-16 high surrogate range (first unit of a pair). */
#define UNI_SUR_HIGH_START (UTF32)0xD800
#define UNI_SUR_HIGH_END (UTF32)0xDBFF
/* Bounds of the UTF-16 low surrogate range (second unit of a pair). */
#define UNI_SUR_LOW_START (UTF32)0xDC00
#define UNI_SUR_LOW_END (UTF32)0xDFFF
/* Value substituted for unconvertible input in lenient conversions. */
#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
/* Largest value that is stored as a single UTF-16 code unit. */
#define UNI_MAX_BMP (UTF32)0x0000FFFF
/* Largest value that is stored as a UTF-16 surrogate pair. */
#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
/* Upper bound accepted for UTF-32 values by pdc_convertUTF8toUTF32. */
#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
static const int halfShift = 10; /* used for shifting by 10 bits */
/* Subtracted from a value above UNI_MAX_BMP before it is split into a
 * surrogate pair, and added back when a pair is combined. */
static const UTF32 halfBase = 0x0010000UL;
/* Selects the low 10 bits, i.e. the payload of the low surrogate. */
static const UTF32 halfMask = 0x3FFUL;
/* --------------------------------------------------------------------- */
/*
 * Converts the UTF-32 values in [*sourceStart, sourceEnd) to UTF-16 code
 * units written to [*targetStart, targetEnd).
 *
 * flags: with strictConversion, surrogate values (0xD800..0xDFFF) and
 *        values above 0x10FFFF stop the conversion with sourceIllegal.
 *        Otherwise surrogate values are copied unchanged and values above
 *        0x10FFFF are replaced by UNI_REPLACEMENT_CHAR.
 *
 * Returns conversionOK, sourceIllegal or targetExhausted. On return
 * *sourceStart points behind the last value that was converted (i.e. at
 * the offending or still unconverted value if the loop stopped early) and
 * *targetStart points behind the last code unit written.
 */
static pdc_convers_result
pdc_convertUTF32toUTF16 (
    UTF32** sourceStart, const UTF32* sourceEnd,
    UTF16** targetStart, const UTF16* targetEnd,
    const pdc_convers_flags flags)
{
    pdc_convers_result result = conversionOK;
    UTF32* source = *sourceStart;
    UTF16* target = *targetStart;

    while (source < sourceEnd) {
        UTF32 ch;

        /* every input value needs at least one output unit */
        if (target >= targetEnd) {
            result = targetExhausted;
            break;
        }
        ch = *source++;

        if (ch <= UNI_MAX_BMP) {
            /* Target is a character <= 0xFFFF */
            if ((flags == strictConversion) &&
                (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
            *target++ = (UTF16) ch; /* normal case */
        } else if (ch > UNI_MAX_UTF16) {
            if (flags == strictConversion) {
                /*
                 * Stop at the first problem and leave source pointing at
                 * it, as for the surrogate case above.
                 */
                --source;
                result = sourceIllegal;
                break;
            }
            *target++ = (UTF16) UNI_REPLACEMENT_CHAR;
        } else {
            /* target is a character in range 0xFFFF - 0x10FFFF. */
            if (target + 1 >= targetEnd) {
                --source; /* value was not converted: give it back */
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
/* --------------------------------------------------------------------- */
/*
 * Converts the UTF-16 code units in [*sourceStart, sourceEnd) to UTF-32
 * values written to [*targetStart, targetEnd).
 *
 * A high surrogate followed by a low surrogate is combined into one value.
 * flags: with strictConversion, any unpaired surrogate (including a high
 *        surrogate that is the last unit of the input) stops the
 *        conversion with sourceIllegal. Otherwise unpaired surrogates are
 *        copied unchanged.
 *
 * Returns conversionOK, sourceIllegal or targetExhausted. On return
 * *sourceStart points behind the last unit that was converted (i.e. at the
 * offending or still unconverted unit if the loop stopped early) and
 * *targetStart points behind the last value written.
 */
static pdc_convers_result
pdc_convertUTF16toUTF32 (
    UTF16** sourceStart, UTF16* sourceEnd,
    UTF32** targetStart, const UTF32* targetEnd,
    const pdc_convers_flags flags)
{
    pdc_convers_result result = conversionOK;
    UTF16* source = *sourceStart;
    UTF32* target = *targetStart;
    /* initialized because the debug output below prints both values */
    UTF32 ch = 0, ch2 = 0;

    while (source < sourceEnd) {
        UTF16* charStart = source; /* start of the current character */

        ch = *source++;
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            if (source < sourceEnd &&
                *source >= UNI_SUR_LOW_START &&
                *source <= UNI_SUR_LOW_END) {
                /* well-formed pair */
                ch2 = *source++;
                ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                     + (ch2 - UNI_SUR_LOW_START) + halfBase;
            } else if (flags == strictConversion) {
                /*
                 * it's an unpaired high surrogate: either the next unit is
                 * no low surrogate or the input ends here
                 */
                ch2 = (source < sourceEnd) ? *source : 0;
                source = charStart; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        } else if ((flags == strictConversion) &&
                   (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) {
            /* an unpaired low surrogate */
            source = charStart; /* return to the illegal value itself */
            result = sourceIllegal;
            break;
        }

        if (target >= targetEnd) {
            source = charStart; /* character was not converted */
            result = targetExhausted;
            break;
        }
        *target++ = ch;
    }

    *sourceStart = source;
    *targetStart = target;
#ifdef CVTUTF_DEBUG
    if (result == sourceIllegal) {
        fprintf(stderr, "pdc_convertUTF16toUTF32 illegal seq 0x%04x,%04x\n",
                ch, ch2);
        fflush(stderr);
    }
#endif
    return result;
}
/* --------------------------------------------------------------------- */
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 *
 * Layout of the table:
 *   0x00 - 0xBF: 0   (this includes 0x80 - 0xBF, which pdc_islegalUTF8
 *                     rejects as first bytes)
 *   0xC0 - 0xDF: 1
 *   0xE0 - 0xEF: 2
 *   0xF0 - 0xF7: 3
 *   0xF8 - 0xFB: 4   (total length 5 is rejected by pdc_islegalUTF8)
 *   0xFC - 0xFF: 5   (total length 6 is rejected by pdc_islegalUTF8)
 */
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
#if 0
/*
 * Compiled out: accessor returning the entry of trailingBytesForUTF8 for
 * index i. No bounds check is performed on i.
 */
static const char
pdc_get_trailingBytesForUTF8(int i) {
return (trailingBytesForUTF8[i]);
}
#endif
/*
 * Per-length correction values for UTF-8 decoding, indexed by the number
 * of trailing bytes. The decoders add up the raw bytes of a sequence
 * (shifting by 6 bits between bytes) and then subtract the entry for that
 * sequence length, which removes the marker bits of the lead byte and of
 * every trailing byte in a single step.
 */
static const UTF32 offsetsFromUTF8[6] = {
    0x00000000UL,   /* 0 trailing bytes: nothing to remove         */
    0x00003080UL,   /* 1: (0xC0 << 6) + 0x80                       */
    0x000E2080UL,   /* 2: (0xE0 << 12) + (0x80 << 6) + 0x80        */
    0x03C82080UL,   /* 3: (0xF0 << 18) + ... + 0x80                */
    0xFA082080UL,   /* 4: (0xF8 << 24) + ... + 0x80                */
    0x82082080UL    /* 5: (0xFC << 30) + ... + 0x80, modulo 2^32   */
};
/*
 * Marker bits OR-ed into the first byte of an encoded UTF-8 sequence,
 * indexed by the total number of bytes of the sequence (index 0 is
 * unused). The table covers one-byte up to six-byte sequences.
 */
static const UTF8 firstByteMark[7] = {
    0x00,   /* [0] unused            */
    0x00,   /* [1] one-byte sequence */
    0xC0,   /* [2] two bytes         */
    0xE0,   /* [3] three bytes       */
    0xF0,   /* [4] four bytes        */
    0xF8,   /* [5] five bytes        */
    0xFC    /* [6] six bytes         */
};
/* --------------------------------------------------------------------- */
/* The interface converts a whole buffer to avoid function-call overhead.
* Constants have been gathered. Loops & conditionals have been removed as
* much as possible for efficiency, in favor of drop-through switches.
* (See "Note A" at the bottom of the file for equivalent code.)
* If your compiler supports it, the "pdc_islegalUTF8" call can be turned
* into an inline function.
*/
/* --------------------------------------------------------------------- */
/*
 * Converts the UTF-16 code units in [*sourceStart, sourceEnd) to UTF-8
 * bytes written to [*targetStart, targetEnd).
 *
 * A high surrogate followed by a low surrogate is combined before it is
 * encoded.
 * flags: with strictConversion, any unpaired surrogate (including a high
 *        surrogate that is the last unit of the input) stops the
 *        conversion with sourceIllegal. Otherwise unpaired surrogates are
 *        encoded like any other 16-bit value.
 *
 * Returns conversionOK, sourceIllegal or targetExhausted. On return
 * *sourceStart points behind the last unit that was converted (i.e. at the
 * offending or still unconverted unit if the loop stopped early) and
 * *targetStart points behind the last byte written.
 */
static pdc_convers_result
pdc_convertUTF16toUTF8 (
    UTF16** sourceStart, const UTF16* sourceEnd,
    UTF8** targetStart, const UTF8* targetEnd,
    const pdc_convers_flags flags)
{
    pdc_convers_result result = conversionOK;
    UTF16* source = *sourceStart;
    UTF8* target = *targetStart;

    while (source < sourceEnd) {
        UTF16* charStart = source; /* start of the current character */
        UTF32 ch;
        unsigned short bytesToWrite = 0;
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;

        ch = *source++;

        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            if (source < sourceEnd &&
                *source >= UNI_SUR_LOW_START &&
                *source <= UNI_SUR_LOW_END) {
                UTF32 ch2 = *source++;

                ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                     + (ch2 - UNI_SUR_LOW_START) + halfBase;
            } else if (flags == strictConversion) {
                /*
                 * it's an unpaired high surrogate: either the next unit is
                 * no low surrogate or the input ends here
                 */
                source = charStart; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        } else if ((flags == strictConversion) &&
                   (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) {
            /* an unpaired low surrogate */
            source = charStart; /* return to the illegal value itself */
            result = sourceIllegal;
            break;
        }

        /* Figure out how many bytes the result will require */
        if (ch < (UTF32)0x80) {
            bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {
            bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {
            bytesToWrite = 3;
        } else if (ch <= UNI_MAX_UTF16) {
            bytesToWrite = 4;
        } else {
            /*
             * Cannot happen for values built from 16-bit units; kept as a
             * safety net. U+FFFD needs three bytes in UTF-8.
             */
            bytesToWrite = 3;
            ch = UNI_REPLACEMENT_CHAR;
        }

        /* compare remaining space without forming a pointer past the end */
        if (targetEnd - target < bytesToWrite) {
            source = charStart; /* character was not converted */
            result = targetExhausted;
            break;
        }

        /* the bytes are written back to front, starting at the last one */
        target += bytesToWrite;
        switch (bytesToWrite) { /* note: everything falls through. */
            case 4:
                *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
                /* FALLTHROUGH */
            case 3:
                *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
                /* FALLTHROUGH */
            case 2:
                *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
                /* FALLTHROUGH */
            case 1:
                *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
        }
        target += bytesToWrite;
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
/* --------------------------------------------------------------------- */
/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
 * If not calling this from pdc_convertUTF8to*, then the length can be set by:
 * length = trailingBytesForUTF8[*source]+1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 * If presented with a length > 4 (or < 1), this returns pdc_false. The
 * Unicode definition of UTF-8 goes up to 4-byte sequences.
 *
 * The bytes are checked from the last one back to the first one:
 * - every trailing byte must be in the range 0x80..0xBF,
 * - the second byte is restricted further for the lead bytes 0xE0 and 0xF0
 *   (second byte too small) and 0xF4 (second byte too large),
 * - the lead byte must not be in 0x80..0xC1 and must not exceed 0xF4.
 * Encoded surrogate values are not rejected here; the callers handle them.
 */
static pdc_bool
pdc_islegalUTF8(UTF8 *source, int length) {
    UTF8 a;
    UTF8 *srcptr = source+length;

    switch (length) {
        default:
            return pdc_false;

        /* Everything else falls through when "pdc_true"... */
        case 4:
            if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
            /* FALLTHROUGH */
        case 3:
            if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
            /* FALLTHROUGH */
        case 2:
            /*
             * The lower bound must be checked for every lead byte. Without
             * it a sequence like <F4 41 80 80> would be accepted, because
             * the 0xF4 case below only tests the upper bound.
             */
            if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;

            switch (*source) {
                /* no fall-through in this inner switch */
                case 0xE0: if (a < 0xA0) return pdc_false; break;
                case 0xF0: if (a < 0x90) return pdc_false; break;
                case 0xF4: if (a > 0x8F) return pdc_false; break;
                default:   break;
            }
            /* FALLTHROUGH */
        case 1:
            if (*source >= 0x80 && *source < 0xC2) return pdc_false;
            if (*source > 0xF4) return pdc_false;
    }
    return pdc_true;
}
/* --------------------------------------------------------------------- */
/*
 * Tells whether the UTF-8 sequence starting at source is legal: the length
 * is derived from the first byte via trailingBytesForUTF8, the sequence
 * must end at or before sourceEnd, and pdc_islegalUTF8 must accept it.
 * The function is declared static and compiled out (#if 0), so it is
 * neither exported nor used here.
 */
#if 0
static pdc_bool pdc_islegalUTF8sequence(UTF8 *source, UTF8 *sourceEnd) {
int length = trailingBytesForUTF8[*source]+1;
if (source+length > sourceEnd) {
return pdc_false;
}
return pdc_islegalUTF8(source, length);
}
#endif
/* --------------------------------------------------------------------- */
/*
 * Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 code
 * units written to [*targetStart, targetEnd).
 *
 * Every sequence is validated with pdc_islegalUTF8, whatever the flags.
 * flags: with strictConversion, a sequence that decodes to a surrogate
 *        value (0xD800..0xDFFF) or to a value above 0x10FFFF stops the
 *        conversion with sourceIllegal. Otherwise a surrogate value is
 *        copied unchanged and a value above 0x10FFFF is replaced by
 *        UNI_REPLACEMENT_CHAR.
 *
 * Returns conversionOK, sourceExhausted (input ends inside a sequence),
 * sourceIllegal or targetExhausted. Whenever the loop stops early,
 * *sourceStart points at the first byte of the sequence that could not be
 * converted; *targetStart points behind the last code unit written.
 */
static pdc_convers_result
pdc_convertUTF8toUTF16 (
    UTF8** sourceStart, UTF8* sourceEnd,
    UTF16** targetStart, const UTF16* targetEnd,
    const pdc_convers_flags flags)
{
    pdc_convers_result result = conversionOK;
    UTF8* source = *sourceStart;
    UTF16* target = *targetStart;

    while (source < sourceEnd) {
        UTF8* seqStart = source; /* first byte of the current sequence */
        UTF32 ch = 0L;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

        /* all trailing bytes must be inside the source buffer */
        if (sourceEnd - source <= extraBytesToRead) {
            result = sourceExhausted;
            break;
        }

        /* Do this check whether lenient or strict */
        if (! pdc_islegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            break;
        }

        /*
         * The cases all fall through. See "Note A" below.
         * Lengths above 4 bytes never get here: pdc_islegalUTF8 rejects them.
         */
        switch (extraBytesToRead) {
            case 3: ch += *source++; ch <<= 6;
                /* FALLTHROUGH */
            case 2: ch += *source++; ch <<= 6;
                /* FALLTHROUGH */
            case 1: ch += *source++; ch <<= 6;
                /* FALLTHROUGH */
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source = seqStart; /* sequence was not converted */
            result = targetExhausted;
            break;
        }

        if (ch <= UNI_MAX_BMP) {
            /* Target is a character <= 0xFFFF */
            if ((flags == strictConversion) &&
                (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
                /* return to the start of the illegal sequence */
                source = seqStart;
                result = sourceIllegal;
                break;
            }
            *target++ = (UTF16) ch; /* normal case */
        } else if (ch > UNI_MAX_UTF16) {
            if (flags == strictConversion) {
                /* return to the start and stop at the first problem */
                source = seqStart;
                result = sourceIllegal;
                break;
            }
            *target++ = (UTF16) UNI_REPLACEMENT_CHAR;
        } else {
            /* target is a character in range 0xFFFF - 0x10FFFF. */
            if (target + 1 >= targetEnd) {
                source = seqStart; /* sequence was not converted */
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
/* --------------------------------------------------------------------- */
/*
 * Converts the UTF-32 values in [*sourceStart, sourceEnd) to UTF-8 bytes
 * written to [*targetStart, targetEnd).
 *
 * flags: with strictConversion, surrogate values (0xD800..0xDFFF) and
 *        values above 0x10FFFF stop the conversion with sourceIllegal.
 *        Otherwise surrogate values are encoded like any other value and
 *        values above 0x10FFFF are replaced by UNI_REPLACEMENT_CHAR.
 *
 * Returns conversionOK, sourceIllegal or targetExhausted. On return
 * *sourceStart points behind the last value that was converted (i.e. at
 * the offending or still unconverted value if the loop stopped early) and
 * *targetStart points behind the last byte written.
 */
static pdc_convers_result
pdc_convertUTF32toUTF8 (
    UTF32** sourceStart, const UTF32* sourceEnd,
    UTF8** targetStart, const UTF8* targetEnd,
    const pdc_convers_flags flags)
{
    pdc_convers_result result = conversionOK;
    UTF32* source = *sourceStart;
    UTF8* target = *targetStart;

    while (source < sourceEnd) {
        UTF32 ch;
        unsigned short bytesToWrite = 0;
        const UTF32 byteMask = 0x000000BF;
        const UTF32 byteMark = 0x00000080;

        ch = *source++;

        if (flags == strictConversion) {
            /*
             * surrogates of any stripe are not legal UTF32 characters,
             * and neither is anything above 0x10FFFF
             */
            if ((ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) ||
                ch > UNI_MAX_UTF16) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }

        /* Figure out how many bytes the result will require */
        if (ch < (UTF32)0x80) {
            bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {
            bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {
            bytesToWrite = 3;
        } else if (ch <= UNI_MAX_UTF16) {
            bytesToWrite = 4;
        } else {
            /*
             * Lenient mode only: substitute the replacement character.
             * U+FFFD needs three bytes; writing it with two bytes would
             * produce the invalid byte pair <FF BD>.
             */
            bytesToWrite = 3;
            ch = UNI_REPLACEMENT_CHAR;
        }

        /* compare remaining space without forming a pointer past the end */
        if (targetEnd - target < bytesToWrite) {
            --source; /* value was not converted: give it back */
            result = targetExhausted;
            break;
        }

        /* the bytes are written back to front, starting at the last one */
        target += bytesToWrite;
        switch (bytesToWrite) { /* note: everything falls through. */
            case 4:
                *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
                /* FALLTHROUGH */
            case 3:
                *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
                /* FALLTHROUGH */
            case 2:
                *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
                /* FALLTHROUGH */
            case 1:
                *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
        }
        target += bytesToWrite;
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
/* --------------------------------------------------------------------- */
/*
 * Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-32 values
 * written to [*targetStart, targetEnd).
 *
 * Every sequence is validated with pdc_islegalUTF8. The flags argument is
 * accepted for symmetry with the other converters but is not evaluated:
 * decoded surrogate values are stored unchanged in either mode.
 *
 * Returns conversionOK, sourceExhausted (input ends inside a sequence),
 * sourceIllegal or targetExhausted. Whenever the loop stops early,
 * *sourceStart points at the first byte of the sequence that could not be
 * converted; *targetStart points behind the last value written.
 */
static pdc_convers_result
pdc_convertUTF8toUTF32 (
    UTF8** sourceStart, UTF8* sourceEnd,
    UTF32** targetStart, const UTF32* targetEnd,
    const pdc_convers_flags flags)
{
    pdc_convers_result result = conversionOK;
    UTF8* source = *sourceStart;
    UTF32* target = *targetStart;

    (void) flags;

    while (source < sourceEnd) {
        UTF8* seqStart = source; /* first byte of the current sequence */
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

        /* all trailing bytes must be inside the source buffer */
        if (sourceEnd - source <= extraBytesToRead) {
            result = sourceExhausted;
            break;
        }

        /* Do this check whether lenient or strict */
        if (! pdc_islegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            break;
        }

        /*
         * The cases all fall through. See "Note A" below.
         * Lengths above 4 bytes never get here: pdc_islegalUTF8 rejects them.
         */
        switch (extraBytesToRead) {
            case 3: ch += *source++; ch <<= 6;
                /* FALLTHROUGH */
            case 2: ch += *source++; ch <<= 6;
                /* FALLTHROUGH */
            case 1: ch += *source++; ch <<= 6;
                /* FALLTHROUGH */
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source = seqStart; /* sequence was not converted */
            result = targetExhausted;
            break;
        }

        /*
         * Every value is either <= UNI_MAX_UTF32 or above it, so exactly
         * one UTF-32 value is written per sequence; no further case exists.
         */
        if (ch <= UNI_MAX_UTF32) {
            *target++ = ch;
        } else {
            *target++ = UNI_REPLACEMENT_CHAR;
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
/* ---------------------------------------------------------------------
Note A.
The fall-through switches in UTF-8 reading code save a
temp variable, some decrements & conditionals. The switches
are equivalent to the following loop:
{
int tmpBytesToRead = extraBytesToRead+1;
do {
ch += *source++;
--tmpBytesToRead;
if (tmpBytesToRead) ch <<= 6;
} while (tmpBytesToRead > 0);
}
In UTF-8 writing code, the switches on "bytesToWrite" are
similarly unrolled loops.
--------------------------------------------------------------------- */
/*
 * Looks up textformat in pdc_textformat_keylist via pdc_get_keyword and
 * returns whatever that lookup yields.
 */
const char *
pdc_get_textformat(int textformat)
{
    const char *keyword;

    keyword = pdc_get_keyword(textformat, pdc_textformat_keylist);
    return keyword;
}
/*
 * Keyword list pairing the UTF text formats with the number used in
 * their name ("8", "16", "32"). The list ends with a NULL entry.
 */
static const pdc_keyconn pdc_utfformat_keylist[] =
{
    {"8",  pdc_utf8},
    {"16", pdc_utf16},
    {"32", pdc_utf32},
    {NULL, 0}
};
/*
* pdc_convert_string converts an arbitrarily encoded string (maybe UTF) to
* another encoded string.
*
* The new converted string is allocated and terminated by the required
* number of zeros.
*
* The caller is responsible for freeing the resulting string buffer.
*
*
* LBP: low byte picking
*
* Input-Parameter:
*
* inutf: input string format (see pc_unicode.h):
*
* pdc_auto: If codepage != 0:
* see above.
* Otherwise:
* If a BOM is recognized:
* pdc_utf8 or pdc_utf16xx resp.
* Otherwise if input encoding <inev> is specified
* and flag PDC_CONV_FORCEUTF16 not set:
* pdc_bytes
* Otherwise:
* pdc_utf16
*
* pdc_auto2: If input encoding is not specified:
* pdc_utf16
* Otherwise after successful LBP:
* pdc_auto
* Otherwise:
* pdc_utf16
*
* pdc_bytes: 8-bit string. Encoding is <inev> if specified.
*
* pdc_bytes2: After successful LBP:
* pdc_bytes
* Otherwise:
* pdc_utf16
*
* pdc_utf8: UTF-8 formatted string.
*
* pdc_ebcdicutf8: EBCDIC-UTF-8 formatted string.
*
* pdc_utf16: If a UTF16 BOM is recognized:
* pdc_utf16be or pdc_utf16le
* Otherwise UTF-16 machine byte ordered string.
*
* pdc_utf16be UTF-16 big endian formatted string.
*
* pdc_utf16le UTF-16 little endian formatted string.
*
* codepage: OEM multi byte code-page number. If > 0 and
* <inutf> = pdc_auto, text will be converted to UTF-16.
*
* inev: Encoding vector for input pdc_bytes string.
*
* glyphtab: Mapping table for character reference names
*
* tabsize: Size of mapping table
*
* replchar: Treatment of non resolvable character references:
* >= 0: replacement character
* == text_error: error message
* == text_nocheck: will be ignored
* (see also pdc_charref2unicodelist())
*
* instring: Input string.
*
* inlen: Length of input string in byte.
*
* oututf: Target format for output string.
* pdc_auto, pdc_auto2 and pdc_bytes2 are not supported.
*
* outev: Encoding vector for output pdc_bytes string.
*
* flags: PDC_CONV_FORCEUTF16:
* In the case of <inutf> = pdc_auto[2] and <inev> != NULL
* <inutf> = pdc_utf16 will be forced.
*
* PDC_CONV_TRY7BYTES:
* UTF-8 output strings will have no BOM if each byte
* is smaller than x80.
* *oututf: pdc_byte.
*
* PDC_CONV_TRYBYTES:
* UTF-16xx output strings will be converted by LBP
* if each character is smaller than x0100.
* *oututf: pdc_byte.
*
* PDC_CONV_WITHBOM:
* UTF-8 or UTF-16xx output strings will be armed
* with an appropriate BOM.
*
* PDC_CONV_NOBOM:
* In UTF-8 or UTF-16xx output strings any BOM sequence
* will be removed. PDC_CONV_WITHBOM is dominant.
*
* PDC_CONV_AUTOBOM:
* BOM sequence will be set automatically if input string
* has a BOM.
*
* PDC_CONV_ANALYZE:
* Only analyzing BOMs of input string and dissolving auto
* textformats.
*
* PDC_CONV_TMPALLOC
* Temporary memory functions (pdc_malloc_tmp) are used
* rather than pdc_malloc etc.
*
* PDC_CONV_HTMLCHAR
* If input encoding vector is specified HTML character
* entities will be substituted.
*
* PDC_CONV_NEWALLOC
* Input string must be allocated at first to guarantee
* pointer alignment.
*
* PDC_CONV_INFLATE
* Invalid UTF-8 to UTF-16xx conversion will not cause
* an exception but rather an inflated byte string will
* be output.
*
* PDC_CONV_ESCSEQU
* Unicode sequences framed by escape character U+001B
* (found in PDF text strings) will be skipped.
*
* PDC_CONV_BSSEQU
* Code sequences beginning with backslash '\'
* will be substituted.
*
* PDC_CONV_ENCERROR
* If an 8-bit code cannot be converted to Unicode by <inev>
* or a Unicode cannot be converted to an 8-bit code by <outev>
* an error message will be created.
*
* PDC_CONV_KEEPLBCHAR
* In the case of PDC_CONV_ENCERROR relevant characters for
* line breaking do not lead to an error message.
*
* PDC_CONV_LOGGING
* Enables logging.
*
* verbose: Error messages are put out. Otherwise they are saved only.
*
* Output-Parameter:
*
* oututf: Reached format for output string.
*
* outstring: Pointer of allocated output string
*
* outlen: Length of output string.
*
*/
#if defined(_MSC_VER) && defined(_MANAGED)
#pragma unmanaged
#endif
/*
 * Convenience front end of pdc_convert_textstring: forwards all arguments
 * unchanged and passes no glyph table (NULL, size 0) together with
 * text_nocheck as the replchar argument.
 */
int
pdc_convert_string(pdc_core *pdc,
                   pdc_text_format inutf, int codepage,
                   pdc_encodingvector *inev,
                   pdc_byte *instring, int inlen,
                   pdc_text_format *oututf_p, pdc_encodingvector *outev,
                   pdc_byte **outstring, int *outlen, int flags,
                   pdc_bool verbose)
{
    int retval;

    /* text_nocheck: see bug #1664 */
    retval = pdc_convert_textstring(pdc, inutf, codepage, inev,
                                    NULL, 0, text_nocheck,
                                    instring, inlen,
                                    oututf_p, outev,
                                    outstring, outlen,
                                    flags, verbose);
    return retval;
}
int
pdc_convert_textstring(pdc_core *pdc,
pdc_text_format inutf, int codepage,
pdc_encodingvector *inev,
const pdc_glyph_tab *glyphtab, int tabsize, int replchar,
pdc_byte *instring, int inlen,
pdc_text_format *oututf_p, pdc_encodingvector *outev,
pdc_byte **outstring, int *outlen, int flags,
pdc_bool verbose)
{
static const char *fn = "pdc_convert_textstring";
pdc_bool logg = flags & PDC_CONV_LOGGING;
const char *stemp1 = NULL, *stemp2 = NULL;
char sbuf[64];
pdc_text_format oututf = *oututf_p;
pdc_text_format oututf_s;
pdc_ushort *usinstr = (pdc_ushort *) instring;
pdc_ushort uv = 0;
pdc_byte *instr = NULL;
pdc_bool inalloc = pdc_false;
pdc_bool hasbom = pdc_false;
pdc_bool toswap = pdc_false;
int errcode = 0;
int i, j, n, len = 0;
(void) glyphtab;
(void) tabsize;
(void) replchar;
if (logg || pdc_logg_is_enabled(pdc, 5, trc_encoding))
{
pdc_logg(pdc, "\n");
if (!logg)
pdc_logg(pdc, "\t\ttext string of length %d will be converted...\n",
inlen);
logg = pdc_true;
}
if (logg)
{
pdc_logg(pdc, "\t\tinput textformat for string conversion: %s\n",
pdc_get_keyword(inutf, pdc_textformat_keylist));
if (inev != NULL)
pdc_logg(pdc, "\t\tinput encoding: %s\n", inev->apiname);
if (outev != NULL)
pdc_logg(pdc, "\t\toutput encoding: %s\n", outev->apiname);
}
/* prophylactic */
if (!inlen)
{
instring = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, 4, fn, NULL, NULL) :
pdc_calloc(pdc, 4, fn));
inalloc = pdc_true;
}
else if ((flags & PDC_CONV_NEWALLOC) ||
(flags & PDC_CONV_TMPALLOC) ||
(flags & PDC_CONV_BSSEQU))
{
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) :
pdc_calloc(pdc, (size_t) (inlen + 2), fn));
memcpy(instr, instring, (size_t) inlen);
inalloc = pdc_true;
instring = instr;
instr = NULL;
usinstr = (pdc_ushort *) instring;
}
switch(inutf)
{
/* analyzing 2 byte textformat */
case pdc_auto2:
case pdc_bytes2:
if ((inutf == pdc_auto2 &&
(inev == NULL || (flags & PDC_CONV_FORCEUTF16))) ||
(flags & PDC_CONV_ANALYZE))
{
inutf = pdc_utf16;
}
else
{
if (logg)
pdc_logg(pdc, "\t\ttry to pick low bytes\n");
len = inlen / 2;
if (2 * len != inlen)
{
errcode = PDC_E_CONV_ILLUTF16;
goto PDC_CONV_ERROR;
}
for (i = 0; i < len; i++)
if (usinstr[i] > PDC_UNICODE_MAXLATIN1)
break;
/* low byte picking */
if (i == len)
{
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
pdc_calloc(pdc, (size_t) (len + 2), fn));
for (i = 0; i < len; i++)
instr[i] = (pdc_byte) usinstr[i];
if (inalloc)
{
if (flags & PDC_CONV_TMPALLOC)
pdc_free_tmp(pdc, instring);
else
pdc_free(pdc, instring);
}
inalloc = pdc_true;
instring = instr;
instr = NULL;
inlen = len;
if (inutf == pdc_bytes2)
inutf = pdc_bytes;
else
inutf = pdc_auto;
}
else
{
inutf = pdc_utf16;
}
}
break;
/* OEM multi byte text strings */
case pdc_auto:
case pdc_bytes:
if (codepage > 0)
{
#if defined(WIN32)
if (!(flags & PDC_CONV_ANALYZE) && inlen > 0)
{
if (logg)
pdc_logg(pdc,
"\t\tconverting according Windows codepage %d\n",
codepage);
len = MultiByteToWideChar((UINT) codepage, (DWORD) 0,
(LPCSTR) instring, inlen, NULL, 0);
if (len == 0)
{
DWORD lasterror = GetLastError();
stemp1 = pdc_errprintf(pdc, "cp%d", codepage);
if (lasterror == ERROR_INVALID_PARAMETER)
{
errcode = PDC_E_CONV_UNSUPP_MBTEXTFORM;
}
else
{
errcode = PDC_E_CONV_ILL_MBTEXTSTRING;
}
goto PDC_CONV_ERROR;
}
len *= 2;
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn,
NULL, NULL) :
pdc_calloc(pdc, (size_t) (len + 2), fn));
MultiByteToWideChar((UINT) codepage, (DWORD) 0, (LPCSTR)
instring, inlen,
(LPWSTR) instr, len);
if (inalloc)
{
if (flags & PDC_CONV_TMPALLOC)
pdc_free_tmp(pdc, instring);
else
pdc_free(pdc, instring);
}
inalloc = pdc_true;
instring = instr;
instr = NULL;
inlen = len;
inutf = pdc_utf16;
}
else
{
inutf = pdc_bytes;
}
#else /* WIN32 */
errcode = PDC_E_CONV_UNSUPP_MBTEXTFORM;
goto PDC_CONV_ERROR;
#endif /* !WIN32 */
}
break;
default:
break;
}
/* analyzing UTF-16 textformat */
if (inutf == pdc_utf16)
{
if (pdc_is_utf16be_unicode(instring))
inutf = pdc_utf16be;
else if (pdc_is_utf16le_unicode(instring))
inutf = pdc_utf16le;
}
/* analyzing auto textformat */
else if (inutf == pdc_auto)
{
if (pdc_is_utf8_bytecode(instring))
inutf = PDC_UTF8;
else if (pdc_is_utf16be_unicode(instring))
inutf = pdc_utf16be;
else if (pdc_is_utf16le_unicode(instring))
inutf = pdc_utf16le;
else if (inev && !(flags & PDC_CONV_FORCEUTF16))
inutf = pdc_bytes;
else
inutf = pdc_utf16;
}
if (logg)
pdc_logg(pdc, "\t\tdetermined textformat: %s\n",
pdc_get_keyword(inutf, pdc_textformat_keylist));
/* only analyzing */
if (flags & PDC_CONV_ANALYZE)
goto PDC_CONV_EXIT;
/* conversion to UTF-16 by swapping */
if ((inutf == pdc_utf16be || inutf == pdc_utf16le) &&
(inutf != oututf || flags & PDC_CONV_TRYBYTES ||
flags & PDC_CONV_HTMLCHAR))
{
if (inlen &&
((inutf == pdc_utf16be && !PDC_ISBIGENDIAN) ||
(inutf == pdc_utf16le && PDC_ISBIGENDIAN)))
{
if (inalloc)
pdc_swap_bytes2((char *) instring, inlen, NULL);
else
{
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) :
pdc_calloc(pdc, (size_t) (inlen + 2), fn));
pdc_swap_bytes2((char *) instring, inlen, (char *) instr);
inalloc = pdc_true;
instring = instr;
instr = NULL;
}
}
inutf = pdc_utf16;
}
/* conversion to UTF-32 by swapping */
if (inlen && inutf == pdc_utf32)
{
if ((pdc_is_utf32be_unicode(instring) && !PDC_ISBIGENDIAN) ||
(pdc_is_utf32le_unicode(instring) && PDC_ISBIGENDIAN))
{
if (inalloc)
pdc_swap_bytes4((char *) instring, inlen, NULL);
else
{
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, (size_t) (inlen + 4), fn, NULL, NULL) :
pdc_calloc(pdc, (size_t) (inlen + 4), fn));
pdc_swap_bytes4((char *) instring, inlen, (char *) instr);
inalloc = pdc_true;
instring = instr;
instr = NULL;
}
}
}
/* illegal UTF-16 / UTF-32 */
if (inutf >= pdc_utf16 && inlen % 2)
{
if (inutf == pdc_utf32 && inlen % 4)
errcode = PDC_E_CONV_ILLUTF32;
else
errcode = PDC_E_CONV_ILLUTF16;
goto PDC_CONV_ERROR;
}
/* conversion to UTF-16 by inflation or encoding vector */
if (inutf == pdc_bytes &&
(oututf != pdc_bytes || flags & PDC_CONV_HTMLCHAR || inev != outev))
{
if (logg)
{
if (flags & PDC_CONV_HTMLCHAR)
pdc_logg(pdc, "\t\tbyte character entity substitution\n");
}
len = 2 * inlen;
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
pdc_calloc(pdc, (size_t) (len + 2), fn));
usinstr = (pdc_ushort *) instr;
j = 0;
for (i = 0; i < inlen; i++)
{
uv = (pdc_ushort) instring[i];
if (inev)
{
uv = inev->codes[uv];
if (!uv && (flags & PDC_CONV_ENCERROR) &&
(!(flags & PDC_CONV_KEEPLBCHAR) ||
!pdc_is_linebreaking_relchar(uv)))
{
errcode = PDC_E_ENC_NOTDEF_CODE;
stemp1 = pdc_errprintf(pdc, "x%02X", instring[i]);
stemp2 = inev->apiname;
goto PDC_CONV_ERROR;
}
}
usinstr[j] = uv;
j++;
}
if (inalloc)
{
if (flags & PDC_CONV_TMPALLOC)
pdc_free_tmp(pdc, instring);
else
pdc_free(pdc, instring);
}
inalloc = pdc_true;
instring = instr;
instr = NULL;
inlen = 2 * j;
inutf = pdc_utf16;
}
/* UTF conversion */
oututf_s = oututf;
if ((oututf_s == pdc_bytes && inutf == pdc_utf8) ||
oututf_s == pdc_utf16be || oututf_s == pdc_utf16le)
oututf_s = pdc_utf16;
if (inutf != oututf_s && oututf_s != pdc_bytes)
{
len = 4 * (inlen + 1);
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, (size_t) len, fn, NULL, NULL) :
pdc_calloc(pdc, (size_t) len, fn));
if (inlen)
{
pdc_convers_result result = conversionOK;
pdc_byte *instringa, *instra, *instringe, *instre;
UTF8 *isa8 = NULL, *ise8 = NULL;
UTF16 *isa16, *ise16;
UTF32 *isa32, *ise32;
if (logg)
pdc_logg(pdc, "\t\tUTF conversion\n");
instringa = instring;
instringe = instring + inlen;
instra = instr;
instre = instr + len;
if (inutf == pdc_utf8)
{
isa8 = (UTF8 *) instringa;
ise8 = (UTF8 *) instringe;
if (oututf_s == pdc_utf16)
{
isa16 = (UTF16 *) instra;
ise16 = (UTF16 *) instre;
result = pdc_convertUTF8toUTF16(&isa8, ise8,
&isa16, ise16,
strictConversion);
instra = (pdc_byte *) isa16;
instre = (pdc_byte *) ise16;
}
else
{
isa32 = (UTF32 *) instra;
ise32 = (UTF32 *) instre;
result = pdc_convertUTF8toUTF32(&isa8, ise8,
&isa32, ise32,
strictConversion);
instra = (pdc_byte *) isa32;
instre = (pdc_byte *) ise32;
}
}
else if (inutf == pdc_utf16)
{
isa16 = (UTF16 *) instringa;
ise16 = (UTF16 *) instringe;
if (oututf_s == pdc_utf8)
{
isa8 = (UTF8 *) instra;
ise8 = (UTF8 *) instre;
result = pdc_convertUTF16toUTF8(&isa16, ise16, &isa8, ise8,
strictConversion);
instra = (pdc_byte *) isa8;
instre = (pdc_byte *) ise8;
}
else
{
isa32 = (UTF32 *) instra;
ise32 = (UTF32 *) instre;
result = pdc_convertUTF16toUTF32(&isa16, ise16,
&isa32, ise32,
strictConversion);
instra = (pdc_byte *) isa32;
instre = (pdc_byte *) ise32;
}
}
else if (inutf == pdc_utf32)
{
isa32 = (UTF32 *) instringa;
ise32 = (UTF32 *) instringe;
if (oututf_s == pdc_utf8)
{
isa8 = (UTF8 *) instra;
ise8 = (UTF8 *) instre;
result = pdc_convertUTF32toUTF8(&isa32, ise32,
&isa8, ise8,
strictConversion);
instra = (pdc_byte *) isa8;
instre = (pdc_byte *) ise8;
}
else
{
isa16 = (UTF16 *) instra;
ise16 = (UTF16 *) instre;
result = pdc_convertUTF32toUTF16(&isa32, ise32,
&isa16, ise16,
strictConversion);
instra = (pdc_byte *) isa16;
instre = (pdc_byte *) ise16;
}
}
switch (result)
{
case targetExhausted:
errcode = PDC_E_CONV_MEMOVERFLOW;
break;
case sourceExhausted:
case sourceIllegal:
if (inutf == pdc_utf8)
{
UTF8 *bp, *bpe;
char *sb = sbuf;
bpe = MIN(ise8 - 1, isa8 + 3);
for (bp = isa8; bp <= bpe; bp++)
sb += sprintf(sb, "\\x%02X", *bp);
if (*bp)
sb += sprintf(sb, "...");
sb += sprintf(sb, " (");
for (bp = isa8; bp <= bpe; bp++)
sb += sprintf(sb, "%c", *bp);
if (*bp)
sb += sprintf(sb, "...");
sb += sprintf(sb, ")");
stemp1 = sbuf;
stemp2 = pdc_errprintf(pdc, "%d", isa8 - (UTF8 *)instringa);
if (flags & PDC_CONV_INFLATE)
{
pdc_warning(pdc, PDC_E_CONV_ILLUTF8SEQU, stemp1, stemp2,
0, 0);
pdc_inflate_ascii((char *) instring, inlen,
(char *) instr, pdc_utf16);
instra = instr + 2 * inlen;
}
else
{
errcode = PDC_E_CONV_ILLUTF8SEQU;
}
}
else
{
stemp1 = pdc_get_keyword((int)inutf, pdc_utfformat_keylist);
errcode = PDC_E_CONV_ILLUTF;
}
break;
default:
break;
}
if (errcode)
{
if (logg)
pdc_logg(pdc, "\t\tUTF conversion error %d\n", result);
goto PDC_CONV_ERROR;
}
inlen = instra - instr;
}
if (inalloc)
{
if (flags & PDC_CONV_TMPALLOC)
pdc_free_tmp(pdc, instring);
else
pdc_free(pdc, instring);
}
len = (oututf == pdc_utf32) ? inlen + 4 : inlen + 2;
if (inlen + 4 != len)
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_realloc_tmp(pdc, instr, (size_t) len, fn) :
pdc_realloc(pdc, instr, (size_t) len, fn));
instr[inlen] = 0;
instr[inlen + 1] = 0;
if (oututf == pdc_utf32)
{
instr[inlen + 2] = 0;
instr[inlen + 3] = 0;
}
inalloc = pdc_true;
instring = instr;
instr = NULL;
inutf = oututf_s;
}
if (inutf == pdc_bytes)
{
if (!inalloc)
{
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) :
pdc_calloc(pdc, (size_t) (inlen + 2), fn));
memcpy(instr, instring, (size_t) inlen);
inalloc = pdc_true;
instring = instr;
instr = NULL;
}
}
/* trying to reduce UTF-16 string to bytes string */
if (inutf == pdc_utf16 &&
(oututf == pdc_bytes || flags & PDC_CONV_TRYBYTES))
{
if (logg)
pdc_logg(pdc, "\t\ttry to reduce UTF-16 to bytes\n");
if (pdc_is_utf16be_unicode(instring) ||
pdc_is_utf16le_unicode(instring))
n = 1;
else
n = 0;
len = (inlen - n) / 2;
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
pdc_calloc(pdc, (size_t) (len + 2), fn));
usinstr = (pdc_ushort *) instring;
for (i = 0; i < len; i++)
{
uv = usinstr[i + n];
if (outev && uv)
{
j = pdc_get_encoding_bytecode(pdc, outev, uv);
if (j < 0 && (flags & PDC_CONV_ENCERROR) && oututf == pdc_bytes)
{
errcode = PDC_E_ENC_NOTDEF_UNICODE;
stemp1 = pdc_errprintf(pdc, "%04X", uv);
stemp2 = outev->apiname;
goto PDC_CONV_ERROR;
}
uv = (pdc_ushort) j;
}
if (uv > PDC_UNICODE_MAXLATIN1)
break;
instr[i] = (pdc_byte) uv;
}
if (i == len)
{
if (inalloc)
{
if (flags & PDC_CONV_TMPALLOC)
pdc_free_tmp(pdc, instring);
else
pdc_free(pdc, instring);
}
inalloc = pdc_true;
instring = instr;
instr = NULL;
inlen = len;
inutf = pdc_bytes;
}
else
{
if (flags & PDC_CONV_TMPALLOC)
pdc_free_tmp(pdc, instr);
else
pdc_free(pdc, instr);
instr = NULL;
}
}
/* UTF-8 format */
if (inutf == pdc_utf8)
{
hasbom = pdc_is_utf8_unicode(instring);
if (flags & PDC_CONV_TRY7BYTES)
{
if (logg)
pdc_logg(pdc, "\t\ttry to reduce UTF-8 to 7-bit\n");
for (i = hasbom ? 3 : 0; i < inlen; i++)
if (instring[i] > PDC_UNICODE_MAXASCII)
break;
if (i == inlen)
{
flags &= ~PDC_CONV_WITHBOM;
flags |= PDC_CONV_NOBOM;
inutf = pdc_bytes;
}
}
else if (hasbom && (flags & PDC_CONV_AUTOBOM))
{
flags &= ~PDC_CONV_NOBOM;
flags |= PDC_CONV_WITHBOM;
}
else if ((flags & PDC_CONV_WITHBOM) && (flags & PDC_CONV_NOBOM))
{
flags &= ~PDC_CONV_NOBOM;
}
if (!inalloc || flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM)
{
i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 3 : 0;
j = (flags & PDC_CONV_NOBOM && hasbom) ? 3 : 0;
len = inlen + i - j;
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
pdc_calloc(pdc, (size_t) (len + 2), fn));
memcpy(&instr[i], &instring[j], (size_t) (inlen - j));
instr[len] = 0;
if (inalloc)
{
if (flags & PDC_CONV_TMPALLOC)
pdc_free_tmp(pdc, instring);
else
pdc_free(pdc, instring);
}
inalloc = pdc_true;
instring = instr;
instr = NULL;
inlen = len;
hasbom = (flags & PDC_CONV_WITHBOM);
}
if (hasbom)
{
instring[0] = PDF_BOM2;
instring[1] = PDF_BOM3;
instring[2] = PDF_BOM4;
}
}
/* UTF-16 formats */
if (inutf == pdc_utf16 || inutf == pdc_utf16be || inutf == pdc_utf16le)
{
hasbom = pdc_is_utf16be_unicode(instring) ||
pdc_is_utf16le_unicode(instring);
if (hasbom && (flags & PDC_CONV_AUTOBOM))
{
flags &= ~PDC_CONV_NOBOM;
flags |= PDC_CONV_WITHBOM;
}
else if ((flags & PDC_CONV_WITHBOM) && (flags & PDC_CONV_NOBOM))
{
flags &= ~PDC_CONV_NOBOM;
}
if (!inalloc || oututf == pdc_utf16be || oututf == pdc_utf16le ||
flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM)
{
i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 2 : 0;
j = (flags & PDC_CONV_NOBOM && hasbom) ? 2 : 0;
len = inlen + i - j;
instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
pdc_calloc(pdc, (size_t) (len + 2), fn));
memcpy(&instr[i], &instring[j], (size_t) (inlen - j));
if (inalloc)
{
if (flags & PDC_CONV_TMPALLOC)
pdc_free_tmp(pdc, instring);
else
pdc_free(pdc, instring);
}
instring = instr;
instr = NULL;
inlen = len;
hasbom = (flags & PDC_CONV_WITHBOM);
}
i = hasbom ? 2 : 0;
if (inutf == pdc_utf16)
{
if (oututf == pdc_utf16be)
{
inutf = pdc_utf16be;
toswap = !PDC_ISBIGENDIAN;
}
if (oututf == pdc_utf16le)
{
inutf = pdc_utf16le;
toswap = PDC_ISBIGENDIAN;
}
if (toswap)
pdc_swap_bytes2((char *) &instring[i], inlen - i, NULL);
}
if (hasbom)
{
if (inutf == pdc_utf16be ||
(inutf == pdc_utf16 && PDC_ISBIGENDIAN))
{
instring[0] = PDF_BOM0;
instring[1] = PDF_BOM1;
}
if (inutf == pdc_utf16le ||
(inutf == pdc_utf16 && !PDC_ISBIGENDIAN))
{
instring[0] = PDF_BOM1;
instring[1] = PDF_BOM0;
}
}
}
if (logg)
pdc_logg(pdc, "\t\ttextformat of converted string: %s\n",
pdc_get_keyword(inutf, pdc_textformat_keylist));
PDC_CONV_EXIT:
*oututf_p = inutf;
if (outlen)
*outlen = inlen;
*outstring = instring;
return 0;
PDC_CONV_ERROR:
if (outlen)
*outlen = 0;
*outstring = NULL;
if (errcode > 0)
pdc_set_errmsg(pdc, errcode, stemp1, stemp2, 0, 0);
if (instr != NULL)
{
if (flags & PDC_CONV_TMPALLOC)
pdc_free_tmp(pdc, instr);
else
pdc_free(pdc, instr);
}
if (inalloc)
{
if (flags & PDC_CONV_TMPALLOC)
pdc_free_tmp(pdc, instring);
else
pdc_free(pdc, instring);
}
if (verbose)
PDC_RETHROW(pdc);
return errcode;
}
#if defined(_MSC_VER) && defined(_MANAGED)
#pragma managed
#endif
/*
 * pdc_convert_name_ext converts a string of name data type to UTF-8.
 *
 * flags & PDC_CONV_EBCDIC: the result is EBCDIC-UTF-8.
 *
 * len == 0: The string is null-terminated.
 *           If it has a [EBCDIC-]UTF-8 BOM or flags & PDC_CONV_ISUTF8
 *           is set, the string is duplicated.
 *           Otherwise the string has encoding enc and codepage codepage:
 *           enc == pdc_unicode: the string is "UTF-16" encoded;
 *           enc <  pdc_winansi: the string is "host" encoded;
 *           otherwise the encoding vector of enc is used.
 *
 * len > 0:  The string is a UTF-16 string of len bytes.
 *
 * Returns NULL if name is NULL, otherwise the duplicated or converted
 * string.
 */
char *
pdc_convert_name_ext(pdc_core *pdc, const char *name, int len,
                     pdc_encoding enc, int codepage, int flags)
{
    static const char fn[] = "pdc_convert_name_ext";
    pdc_text_format srcformat = pdc_utf16;
    pdc_text_format dstformat = pdc_utf8;
    pdc_encodingvector *srcev = NULL;
    pdc_byte *converted;
    int convlen;

    if (name == NULL)
        return NULL;

    if (len == 0)
    {
        /* already [EBCDIC-]UTF-8 encoded: try a plain duplicate first */
        if ((flags & PDC_CONV_ISUTF8) || pdc_is_utf8_bytecode(name))
        {
            char *copy;

            if ((flags & PDC_CONV_WITHBOM) == 0)
                flags |= PDC_CONV_NOBOM;
            if ((flags & PDC_CONV_EBCDIC) == 0)
                flags |= PDC_CONV_ASCII;

            /* On EBCDIC platforms EBCDIC-UTF-8 name strings are expected */
            copy = pdc_strdup_ext(pdc, name, (flags & ~PDC_CONV_EBCDIC), fn);
            if (copy != NULL)
                return copy;

            /* no duplicate: fall through to the regular conversion,
             * keeping the flags adjusted above
             */
        }

        /* see bug #1486 */
        if (enc != pdc_unicode)
        {
            /* 8-bit encoded string */
            srcformat = pdc_bytes;
            len = (int) strlen(name);

            if (enc >= pdc_winansi)
                srcev = pdc_get_encoding_vector(pdc, enc);
            else
                srcev = pdc_get_encoding_vector(pdc,
                            pdc_find_encoding(pdc, "host"));
        }
        else
        {
            /* UTF-16 encoded string */
            len = (int) pdc_wstrlen(name);
        }
    }

    if (flags & PDC_CONV_EBCDIC)
        dstformat = PDC_UTF8;

    /* collect the remaining conversion options */
    flags |= PDC_CONV_TRY7BYTES;
    if (pdc->charref)
        flags |= PDC_CONV_HTMLCHAR;
    if (pdc->escapesequ)
        flags |= PDC_CONV_BSSEQU;

    /* convert to UTF-8 */
    pdc_convert_string(pdc, srcformat, codepage, srcev,
                       (pdc_byte *) name, len,
                       &dstformat, NULL, &converted, &convlen,
                       flags, pdc_true);

    return (char *) converted;
}
/*
 * pdc_convert_name is the short form of pdc_convert_name_ext() for callers
 * without encoding information: pdc_invalidenc is passed as encoding and
 * 0 as codepage.
 */
char *
pdc_convert_name(pdc_core *pdc, const char *name, int len, int flags)
{
    char *converted;

    converted = pdc_convert_name_ext(pdc, name, len, pdc_invalidenc, 0, flags);

    return converted;
}
/*
 * pdc_utf8_to_hostbytes converts the null-terminated string name from
 * PDC_UTF8 to a byte string, using the encoding vector of the "host"
 * encoding.
 *
 * honorlang is not evaluated here.
 *
 * Returns NULL if the converted string is still in UTF-16 format, i.e.
 * it could not be reduced to bytes. The returned string is temporarily
 * allocated.
 */
char *
pdc_utf8_to_hostbytes(pdc_core *pdc, pdc_bool honorlang, char *name)
{
    static const char fn[] = "pdc_utf8_to_hostbytes";
    pdc_text_format srcformat = PDC_UTF8;
    pdc_text_format dstformat = pdc_utf16;
    pdc_encoding hostenc;
    pdc_encodingvector *hostev;
    pdc_byte *hostbytes = NULL;
    int nbytes;

    (void) fn;
    (void) honorlang;

    nbytes = (int) strlen(name);

    hostenc = pdc_find_encoding(pdc, "host");
    hostev = pdc_get_encoding_vector(pdc, hostenc);

    pdc_convert_string(pdc, srcformat, 0, NULL, (pdc_byte *) name, nbytes,
                       &dstformat, hostev, &hostbytes, &nbytes,
                       PDC_CONV_TRYBYTES | PDC_CONV_NOBOM | PDC_CONV_TMPALLOC,
                       pdc_true);

    /* the reduction to bytes failed: discard the UTF-16 result */
    if (dstformat == pdc_utf16)
    {
        pdc_free_tmp(pdc, hostbytes);
        return NULL;
    }

    return (char *) hostbytes;
}
/*
 * pdc_hostbytes_to_utf8 converts the null-terminated byte string name,
 * interpreted with the encoding vector of the "host" encoding, to
 * PDC_UTF8 without BOM.
 *
 * honorlang is not evaluated here.
 *
 * The returned string is temporarily allocated.
 */
char *
pdc_hostbytes_to_utf8(pdc_core *pdc, pdc_bool honorlang, char *name)
{
    static const char fn[] = "pdc_hostbytes_to_utf8";
    pdc_text_format srcformat = pdc_bytes;
    pdc_text_format dstformat = PDC_UTF8;
    pdc_encoding hostenc;
    pdc_encodingvector *hostev;
    pdc_byte *utf8name = NULL;
    int nbytes;

    (void) fn;
    (void) honorlang;

    nbytes = (int) strlen(name);

    hostenc = pdc_find_encoding(pdc, "host");
    hostev = pdc_get_encoding_vector(pdc, hostenc);

    pdc_convert_string(pdc, srcformat, 0, hostev, (pdc_byte *) name, nbytes,
                       &dstformat, NULL, &utf8name, &nbytes,
                       PDC_CONV_NOBOM | PDC_CONV_TMPALLOC, pdc_true);

    return (char *) utf8name;
}
/* --------------------- basic UTF conversion functions --------------------- */
/*
 * pdc_utf16_to_utf8 converts a UTF-16 string of len bytes to UTF-8
 * (to PDC_UTF8 if flags & PDC_CONV_EBCDIC is set).
 *
 * PDC_CONV_AUTOBOM is always added to flags.
 * A NULL utf16string is reported via pdc_error(PDC_E_ILLARG_EMPTY).
 * If size is not NULL, it receives the length delivered by
 * pdc_convert_string().
 */
char *
pdc_utf16_to_utf8(pdc_core *pdc, const char *utf16string, int len, int flags,
                  int *size)
{
    pdc_text_format dstformat;
    pdc_byte *converted = NULL;
    int convlen;

    if (utf16string == NULL)
        pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf16string", 0, 0, 0);

    dstformat = (flags & PDC_CONV_EBCDIC) ? PDC_UTF8 : pdc_utf8;

    pdc_convert_string(pdc, pdc_utf16, 0, NULL,
                       (pdc_byte *) utf16string, len,
                       &dstformat, NULL, &converted, &convlen,
                       flags | PDC_CONV_AUTOBOM, pdc_true);

    if (size != NULL)
        *size = convlen;

    return (char *) converted;
}
/*
 * pdc_utf8_to_utf16 converts a null-terminated UTF-8 string (PDC_UTF8 if
 * flags & PDC_CONV_EBCDIC is set) to UTF-16.
 *
 * format: NULL or empty selects pdc_utf16. Otherwise it is looked up in
 *         pdc_textformat_keylist. If that fails, it is split into a
 *         string list: the item "inflate" sets PDC_CONV_INFLATE, the
 *         last other item is taken as format keyword, and pdc_utf16 is
 *         used if there is no other item (see bug #2175).
 *         Anything but pdc_utf16, pdc_utf16be or pdc_utf16le is reported
 *         via pdc_error(PDC_E_ILLARG_STRING).
 *
 * For pdc_utf16 PDC_CONV_AUTOBOM is added to flags, for the other two
 * formats PDC_CONV_WITHBOM.
 * A NULL utf8string is reported via pdc_error(PDC_E_ILLARG_EMPTY).
 * size is passed to pdc_convert_string() as output length.
 */
char *
pdc_utf8_to_utf16(pdc_core *pdc, const char *utf8string, const char *format,
                  int flags, int *size)
{
    pdc_text_format srcformat = pdc_utf8;
    pdc_text_format dstformat = pdc_utf16;
    pdc_byte *converted = NULL;
    int nbytes;

    if (utf8string == NULL)
        pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf8string", 0, 0, 0);
    nbytes = (int) strlen(utf8string);

    if (format != NULL && *format != '\0')
    {
        int fmtcode = pdc_get_keycode_ci(format, pdc_textformat_keylist);

        /* see bug #2175 */
        if (fmtcode == PDC_KEY_NOTFOUND)
        {
            const char *keyword = NULL;
            char **items;
            int nitems, idx;

            nitems = pdc_split_stringlist(pdc, format, NULL, 0, &items);
            for (idx = 0; idx < nitems; idx++)
            {
                if (strcmp(items[idx], "inflate") == 0)
                    flags |= PDC_CONV_INFLATE;
                else
                    keyword = items[idx];
            }

            /* keyword points into items: look it up before the cleanup */
            if (keyword == NULL)
                fmtcode = pdc_utf16;
            else
                fmtcode = pdc_get_keycode_ci(keyword, pdc_textformat_keylist);

            pdc_cleanup_stringlist(pdc, items);
        }

        /* only the three UTF-16 flavors are acceptable */
        if (fmtcode == PDC_KEY_NOTFOUND ||
            !((pdc_text_format) fmtcode == pdc_utf16 ||
              (pdc_text_format) fmtcode == pdc_utf16be ||
              (pdc_text_format) fmtcode == pdc_utf16le))
        {
            pdc_error(pdc, PDC_E_ILLARG_STRING, "format", format, 0, 0);
        }

        dstformat = (pdc_text_format) fmtcode;
    }

    if (flags & PDC_CONV_EBCDIC)
        srcformat = PDC_UTF8;

    flags |= (dstformat == pdc_utf16) ? PDC_CONV_AUTOBOM : PDC_CONV_WITHBOM;

    pdc_convert_string(pdc, srcformat, 0, NULL,
                       (pdc_byte *) utf8string, nbytes,
                       &dstformat, NULL, &converted, size,
                       flags, pdc_true);

    return (char *) converted;
}
/*
 * pdc_utf16_to_utf32 converts a UTF-16 string of len bytes to UTF-32.
 *
 * No conversion flags are set.
 * A NULL utf16string is reported via pdc_error(PDC_E_ILLARG_EMPTY).
 * size is passed to pdc_convert_string() as output length.
 */
char *
pdc_utf16_to_utf32(pdc_core *pdc, const char *utf16string, int len, int *size)
{
    pdc_text_format dstformat = pdc_utf32;
    pdc_byte *converted = NULL;

    if (utf16string == NULL)
        pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf16string", 0, 0, 0);

    pdc_convert_string(pdc, pdc_utf16, 0, NULL,
                       (pdc_byte *) utf16string, len,
                       &dstformat, NULL, &converted, size,
                       0, pdc_true);

    return (char *) converted;
}
/*
 * pdc_utf32_to_utf8 converts a UTF-32 string of len bytes to UTF-8
 * (to PDC_UTF8 if flags & PDC_CONV_EBCDIC is set).
 *
 * PDC_CONV_AUTOBOM is always added to flags.
 * A NULL utf32string is reported via pdc_error(PDC_E_ILLARG_EMPTY).
 * If size is not NULL, it receives the length delivered by
 * pdc_convert_string().
 */
char *
pdc_utf32_to_utf8(pdc_core *pdc, const char *utf32string, int len, int flags,
                  int *size)
{
    pdc_text_format dstformat;
    pdc_byte *converted = NULL;
    int convlen;

    if (utf32string == NULL)
        pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf32string", 0, 0, 0);

    dstformat = (flags & PDC_CONV_EBCDIC) ? PDC_UTF8 : pdc_utf8;

    pdc_convert_string(pdc, pdc_utf32, 0, NULL,
                       (pdc_byte *) utf32string, len,
                       &dstformat, NULL, &converted, &convlen,
                       flags | PDC_CONV_AUTOBOM, pdc_true);

    if (size != NULL)
        *size = convlen;

    return (char *) converted;
}
/*
 * pdc_utf32_to_utf16 converts a UTF-32 string of len bytes to UTF-16.
 *
 * format: NULL or empty selects pdc_utf16. Otherwise it is looked up in
 *         pdc_textformat_keylist; anything but pdc_utf16, pdc_utf16be or
 *         pdc_utf16le is reported via pdc_error(PDC_E_ILLARG_STRING).
 *
 * For pdc_utf16 PDC_CONV_AUTOBOM is added to flags, for the other two
 * formats PDC_CONV_WITHBOM.
 * A NULL utf32string is reported via pdc_error(PDC_E_ILLARG_EMPTY).
 * size is passed to pdc_convert_string() as output length.
 */
char *
pdc_utf32_to_utf16(pdc_core *pdc, const char *utf32string, int len,
                   const char *format, int flags, int *size)
{
    pdc_text_format dstformat = pdc_utf16;
    pdc_byte *converted = NULL;

    if (utf32string == NULL)
        pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf32string", 0, 0, 0);

    if (format != NULL && *format != '\0')
    {
        int fmtcode = pdc_get_keycode_ci(format, pdc_textformat_keylist);

        /* only the three UTF-16 flavors are acceptable */
        if (fmtcode == PDC_KEY_NOTFOUND ||
            !((pdc_text_format) fmtcode == pdc_utf16 ||
              (pdc_text_format) fmtcode == pdc_utf16be ||
              (pdc_text_format) fmtcode == pdc_utf16le))
        {
            pdc_error(pdc, PDC_E_ILLARG_STRING, "format", format, 0, 0);
        }

        dstformat = (pdc_text_format) fmtcode;
    }

    flags |= (dstformat == pdc_utf16) ? PDC_CONV_AUTOBOM : PDC_CONV_WITHBOM;

    pdc_convert_string(pdc, pdc_utf32, 0, NULL,
                       (pdc_byte *) utf32string, len,
                       &dstformat, NULL, &converted, size,
                       flags, pdc_true);

    return (char *) converted;
}
/*
 * pdc_char16_to_char32 delivers the character found at index *ic of the
 * UTF-16 array ustext, which has len elements.
 *
 * A value outside the range PDC_UNICODE_MINHIGHSUR..PDC_UNICODE_MAXLOWSUR
 * is returned unchanged; *ic is not modified.
 *
 * A value up to PDC_UNICODE_MAXHIGHSUR which is followed (within len) by a
 * value in the range PDC_UNICODE_MINLOWSUR..PDC_UNICODE_MAXLOWSUR is
 * converted together with its successor to one UTF-32 value; *ic is then
 * set to the index of the second element.
 *
 * In all other cases the error message PDC_E_CONV_ILLUTF16SUR is set and
 * -1 is returned; if verbose is set, pdc_error() is called first.
 */
int
pdc_char16_to_char32(pdc_core *pdc, const pdc_ushort *ustext, int *ic, int len,
                     pdc_bool verbose)
{
    pdc_ushort uvh = ustext[*ic];
    pdc_ushort uvl = 0;
    int icn;

    /* no surrogate at all */
    if (uvh < PDC_UNICODE_MINHIGHSUR || uvh > PDC_UNICODE_MAXLOWSUR)
        return (int) uvh;

    icn = *ic + 1;
    if (icn < len)
    {
        uvl = ustext[icn];

        if (uvh <= PDC_UNICODE_MAXHIGHSUR &&
            uvl >= PDC_UNICODE_MINLOWSUR && uvl <= PDC_UNICODE_MAXLOWSUR)
        {
            /* The converter writes a UTF32 object. Let it write into a
             * variable of exactly that type instead of a casted int, so
             * that nothing depends on UTF32 and int having the same size.
             */
            UTF32 usv = 0;
            UTF16 *isa16 = (UTF16 *) &ustext[*ic];
            UTF16 *ise16 = isa16 + 2;
            UTF32 *isa32 = &usv;
            UTF32 *ise32 = isa32 + 1;
            pdc_convers_result result = pdc_convertUTF16toUTF32(
                        &isa16, ise16, &isa32, ise32, strictConversion);

            if (result == conversionOK)
            {
                *ic = icn;
                return (int) usv;
            }
        }
    }

    /* uvl is 0 here if there was no element after the surrogate */
    pdc_set_errmsg(pdc, PDC_E_CONV_ILLUTF16SUR,
                   pdc_errprintf(pdc, "%04X", uvh),
                   pdc_errprintf(pdc, "%04X", uvl), 0, 0);

    if (verbose)
        pdc_error(pdc, -1, 0, 0, 0, 0);

    return -1;
}
/*
 * pdc_char32_to_char16 stores the UTF-32 value usv as UTF-16 in uvlist
 * and returns the number of UTF-16 elements written.
 *
 * usv < PDC_NUM_BMPVAL: usv is stored in uvlist[0]; 1 is returned.
 * Otherwise:            usv is converted to UTF-16 with room for two
 *                       elements in uvlist; 2 is returned on success.
 *
 * If the conversion fails, the error message PDC_E_CONV_ILLUTF32CHAR is
 * set and 0 is returned; if verbose is set, pdc_error() is called first.
 *
 * NOTE(review): usv is a signed int, so negative values presumably take
 * the first branch as well and are truncated to pdc_ushort - confirm that
 * callers never pass negative values.
 */
int
pdc_char32_to_char16(pdc_core *pdc, int usv, pdc_ushort *uvlist,
                     pdc_bool verbose)
{
    if (usv < PDC_NUM_BMPVAL)
    {
        uvlist[0] = (pdc_ushort) usv;
        return 1;
    }
    else
    {
        /* The converter reads a UTF32 object. Hand it a variable of
         * exactly that type instead of a casted int, so that nothing
         * depends on UTF32 and int having the same size.
         */
        UTF32 usv32 = (UTF32) usv;
        UTF32 *isa32 = &usv32;
        UTF32 *ise32 = isa32 + 1;
        UTF16 *isa16 = (UTF16 *) uvlist;
        UTF16 *ise16 = isa16 + 2;
        pdc_convers_result result = pdc_convertUTF32toUTF16(
                    &isa32, ise32, &isa16, ise16, strictConversion);

        if (result == conversionOK)
            return 2;

        pdc_set_errmsg(pdc, PDC_E_CONV_ILLUTF32CHAR,
                       pdc_errprintf(pdc, "%05X", usv), 0, 0, 0);

        if (verbose)
            pdc_error(pdc, -1, 0, 0, 0, 0);
    }

    return 0;
}