diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index e8d9ec0d6153a..09f488fc45513 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -592,6 +592,10 @@ else()
   option(LLVM_ENABLE_THREADS "Use threads if available." ON)
 endif()
 
+set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
+
+set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
+
 set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 43311dad457ec..9d59fea8799b1 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -294,6 +294,43 @@ if(LLVM_HAS_LOGF128)
   set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
 endif()
 
+if (LLVM_ENABLE_ICU STREQUAL FORCE_ON AND LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+  message(FATAL_ERROR "LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV should not both be FORCE_ON")
+endif()
+
+# Check for ICU. Only allow an optional, dynamic link for ICU so we don't impact LLVM's licensing.
+if(LLVM_ENABLE_ICU AND NOT(LLVM_ENABLE_ICONV STREQUAL FORCE_ON))
+  # Restrict the search to shared libraries; the original suffix list is restored below.
+  set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}")
+  if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
+    find_package(ICU REQUIRED COMPONENTS uc i18n)
+    if (NOT ICU_FOUND)
+      message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
+    endif()
+  else()
+    find_package(ICU COMPONENTS uc i18n)
+  endif()
+  set(HAVE_ICU ${ICU_FOUND})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
+endif()
+
+# Check only for builtin iconv to avoid licensing issues.
+if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU)
+  if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+    find_package(Iconv REQUIRED)
+    if (NOT Iconv_FOUND OR NOT Iconv_IS_BUILT_IN)
+      message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
+    endif()
+  else()
+    find_package(Iconv)
+  endif()
+  # HAVE_ICONV is only set when the iconv that was found is reported as built-in.
+  if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
+    set(HAVE_ICONV 1)
+  endif()
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 7efac55ab0352..06d4756397911 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -236,6 +236,12 @@
 /* Have host's ___chkstk_ms */
 #cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
 
+/* Define to 1 if the ICU library is available, or 0 otherwise */
+#cmakedefine01 HAVE_ICU
+
+/* Define to 1 if the iconv library is available, or 0 otherwise */
+#cmakedefine01 HAVE_ICONV
+
 /* Linker version detected at compile time. */
 #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
 
diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h
new file mode 100644
index 0000000000000..e204b95dd2dd7
--- /dev/null
+++ b/llvm/include/llvm/Support/TextEncoding.h
@@ -0,0 +1,140 @@
+//===-- TextEncoding.h - Text encoding conversion class -----------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_TEXT_ENCODING_H
+#define LLVM_SUPPORT_TEXT_ENCODING_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <memory>
+#include <string>
+#include <system_error>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+namespace details {
+/// Interface implemented by every conversion back end.
+///
+/// Back ends override the private hooks convertString() and reset(); users go
+/// through the public convert(), which runs the conversion and then always
+/// resets the converter.
+class TextEncodingConverterImplBase {
+
+private:
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return error code in case something went wrong
+  ///
+  /// The following error codes can occur, among others:
+  ///   - std::errc::argument_list_too_long: The result requires more than
+  ///     std::numeric_limits<size_t>::max() bytes.
+  ///   - std::errc::illegal_byte_sequence: The input contains an invalid
+  ///     multibyte sequence.
+  ///   - std::errc::invalid_argument: The input contains an incomplete
+  ///     multibyte sequence.
+  ///
+  /// If the destination encoding is stateful, the shift state will be set
+  /// to the initial state.
+  ///
+  /// In case of an error, the result string contains the successfully converted
+  /// part of the input string.
+  ///
+  virtual std::error_code convertString(StringRef Source,
+                                        SmallVectorImpl<char> &Result) = 0;
+
+  /// Resets the converter to the initial state.
+  virtual void reset() = 0;
+
+public:
+  virtual ~TextEncodingConverterImplBase() = default;
+
+  /// Converts a string and resets the converter to the initial state.
+  /// The reset happens whether or not the conversion succeeded.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return the error code produced by the conversion
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
+    auto EC = convertString(Source, Result);
+    reset();
+    return EC;
+  }
+};
+} // namespace details
+
+// Names inspired by https://wg21.link/p1885.
+enum class TextEncoding {
+  /// UTF-8 character set encoding.
+  UTF8,
+
+  /// IBM EBCDIC 1047 character set encoding.
+  IBM1047
+};
+
+/// Utility class to convert between different character encodings.
+class TextEncodingConverter {
+  /// The back end that performs the actual conversion.
+  std::unique_ptr<details::TextEncodingConverterImplBase> Converter;
+
+  /// Instances are only created through the create() factories.
+  explicit TextEncodingConverter(
+      std::unique_ptr<details::TextEncodingConverterImplBase> Converter)
+      : Converter(std::move(Converter)) {}
+
+public:
+  /// Creates a TextEncodingConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] From the source character encoding
+  /// \param[in] To the target character encoding
+  /// \return a TextEncodingConverter instance or an error code
+  static ErrorOr<TextEncodingConverter> create(TextEncoding From,
+                                               TextEncoding To);
+
+  /// Creates a TextEncodingConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] From name of the source character encoding
+  /// \param[in] To name of the target character encoding
+  /// \return a TextEncodingConverter instance or an error code
+  static ErrorOr<TextEncodingConverter> create(StringRef From, StringRef To);
+
+  TextEncodingConverter(const TextEncodingConverter &) = delete;
+  TextEncodingConverter &operator=(const TextEncodingConverter &) = delete;
+
+  // The only member is a unique_ptr, so the defaulted move operations do the
+  // right thing and are noexcept.
+  TextEncodingConverter(TextEncodingConverter &&Other) noexcept = default;
+  TextEncodingConverter &
+  operator=(TextEncodingConverter &&Other) noexcept = default;
+
+  ~TextEncodingConverter() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const {
+    return Converter->convert(Source, Result);
+  }
+
+  /// Converts a string and returns the result by value.
+  /// \param[in] Source source string
+  /// \return the converted string, or the error code of the failed conversion
+  ErrorOr<std::string> convert(StringRef Source) const {
+    SmallString<100> Result;
+    auto EC = Converter->convert(Source, Result);
+    if (!EC)
+      return std::string(Result);
+    return EC;
+  }
+};
+
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_TEXT_ENCODING_H
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index df1e65f3a588c..09e93f5a2ca7d 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -257,6 +257,7 @@ add_llvm_component_library(LLVMSupport
   SuffixTree.cpp
   SystemUtils.cpp
   TarWriter.cpp
+  TextEncoding.cpp
   ThreadPool.cpp
   TimeProfiler.cpp
   Timer.cpp
@@ -316,6 +317,14 @@ add_llvm_component_library(LLVMSupport
   Demangle
   )
 
+# Link ICU library if it is an external library.
+if(ICU_FOUND)
+  target_link_libraries(LLVMSupport
+    PRIVATE
+    ${ICU_LIBRARIES}
+    )
+endif()
+
 set(llvm_system_libs ${system_libs})
 
 # This block is only needed for llvm-config. When we deprecate llvm-config and
diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp
new file mode 100644
index 0000000000000..969dd419ede72
--- /dev/null
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -0,0 +1,357 @@
+//===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/TextEncoding.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cerrno>
+#include <memory>
+#include <optional>
+#include <system_error>
+
+#if HAVE_ICU
+#include <unicode/ucnv.h>
+#elif HAVE_ICONV
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Normalize the charset name with the charset alias matching algorithm proposed
+// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
+// Non-alphanumeric characters are dropped, letters are lowercased, and a '0'
+// is dropped unless the previously kept character was a digit. The normalized
+// name is appended to Normalized.
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
+  bool PrevDigit = false;
+  for (auto Ch : CSName) {
+    if (isAlnum(Ch)) {
+      Ch = toLower(Ch);
+      if (Ch != '0' || PrevDigit) {
+        PrevDigit = isDigit(Ch);
+        Normalized.push_back(Ch);
+      }
+    }
+  }
+}
+
+// Maps the encoding name to enum constant if possible. Returns std::nullopt
+// for names that do not normalize to one of the known encodings.
+static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
+  SmallString<16> Normalized;
+  normalizeCharSetName(Name, Normalized);
+  if (Normalized.equals("utf8"))
+    return TextEncoding::UTF8;
+  if (Normalized.equals("ibm1047"))
+    return TextEncoding::IBM1047;
+  return std::nullopt;
+}
+
+// Grows the output buffer after a conversion ran out of space.
+// On return, Result has been resized to the new Capacity, Output points to
+// the start of its storage and OutputLength equals Capacity. Previously
+// written output is discarded; the caller restarts the conversion.
+LLVM_ATTRIBUTE_UNUSED static void
+HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
+               SmallVectorImpl<char> &Result) {
+  // No space left in output buffer. Double the size of the underlying
+  // memory in the SmallVectorImpl (saturating at max_size()), adjust pointer
+  // and length and continue the conversion.
+  // A capacity of zero cannot grow by doubling, so start from one byte;
+  // otherwise the caller's retry loop would never make progress.
+  const size_t MaxSize = Result.max_size();
+  if (Capacity == 0)
+    Capacity = 1;
+  else
+    Capacity = (Capacity < MaxSize / 2) ? 2 * Capacity : MaxSize;
+  Result.resize(0);
+  Result.resize_for_overwrite(Capacity);
+  Output = static_cast<char *>(Result.data());
+  OutputLength = Capacity;
+}
+
+namespace {
+enum ConversionType {
+  UTF8ToIBM1047,
+  IBM1047ToUTF8,
+};
+
+// Support conversion between EBCDIC 1047 and UTF-8. This class uses
+// built-in translation tables that allow for translation between the
+// aforementioned encodings. The use of tables for conversion is only
+// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
+// encodings are not supported.
+class TextEncodingConverterTable final
+    : public details::TextEncodingConverterImplBase {
+  const ConversionType ConvType;
+
+public:
+  explicit TextEncodingConverterTable(ConversionType ConvType)
+      : ConvType(ConvType) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  // The table-based conversion keeps no state, so there is nothing to reset.
+  void reset() override {}
+};
+
+// Dispatches to the ConverterEBCDIC routine matching the conversion direction.
+std::error_code
+TextEncodingConverterTable::convertString(StringRef Source,
+                                          SmallVectorImpl<char> &Result) {
+  switch (ConvType) {
+  case IBM1047ToUTF8:
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  case UTF8ToIBM1047:
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  }
+  llvm_unreachable("Invalid ConvType!");
+  return std::error_code();
+}
+
+#if HAVE_ICU
+// Deleter that closes an ICU converter handle.
+struct UConverterDeleter {
+  void operator()(UConverter *Converter) const {
+    if (Converter)
+      ucnv_close(Converter);
+  }
+};
+using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
+
+// Conversion back end built on a pair of ICU converters, one for the source
+// encoding and one for the target encoding.
+class TextEncodingConverterICU final
+    : public details::TextEncodingConverterImplBase {
+  UConverterUniquePtr FromConvDesc;
+  UConverterUniquePtr ToConvDesc;
+
+public:
+  TextEncodingConverterICU(UConverterUniquePtr FromConverter,
+                           UConverterUniquePtr ToConverter)
+      : FromConvDesc(std::move(FromConverter)),
+        ToConvDesc(std::move(ToConverter)) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override;
+};
+
+// TODO: The current implementation discards the partial result and restarts the
+// conversion from the beginning if there is a conversion error due to
+// insufficient buffer size. In the future, it would be better to save the
+// partial result and resume the conversion for the remaining string.
+// TODO: Improve translation of ICU errors to error_code
+//
+// Converts Source from the encoding of FromConvDesc to the encoding of
+// ToConvDesc, writing directly into Result. Returns E2BIG if the output
+// cannot grow any further and EILSEQ for any other ICU failure.
+std::error_code
+TextEncodingConverterICU::convertString(StringRef Source,
+                                        SmallVectorImpl<char> &Result) {
+  // An empty source converts to an empty result. Handle it up front: there is
+  // no output position to hand to ICU in that case, and the final result size
+  // is derived from that position.
+  if (Source.empty()) {
+    Result.clear();
+    return std::error_code();
+  }
+
+  // Setup the input.
+  const size_t InputLength = Source.size();
+  const char *In = Source.data();
+
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  size_t OutputLength = Capacity;
+  Result.resize_for_overwrite(Capacity);
+  char *Output;
+  UErrorCode EC = U_ZERO_ERROR;
+
+  // Stop at the first character that cannot be converted instead of
+  // substituting it.
+  ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr,
+                      nullptr, &EC);
+  ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, nullptr,
+                        nullptr, nullptr, &EC);
+  assert(U_SUCCESS(EC));
+
+  do {
+    EC = U_ZERO_ERROR;
+    const char *Input = In;
+
+    Output = static_cast<char *>(Result.data());
+    ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
+                   In + InputLength, /*pivotStart=*/nullptr,
+                   /*pivotSource=*/nullptr, /*pivotTarget=*/nullptr,
+                   /*pivotLimit=*/nullptr, /*reset=*/true,
+                   /*flush=*/true, &EC);
+    if (U_FAILURE(EC)) {
+      if (EC == U_BUFFER_OVERFLOW_ERROR) {
+        if (Capacity >= Result.max_size())
+          return std::error_code(E2BIG, std::generic_category());
+        // Grow the buffer and restart the conversion from the beginning.
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+        continue;
+      }
+      // Some other error occurred.
+      Result.resize(Output - Result.data());
+      return std::error_code(EILSEQ, std::generic_category());
+    }
+    break;
+  } while (true);
+
+  // Re-adjust size to actual size.
+  Result.resize(Output - Result.data());
+  return std::error_code();
+}
+
+// Resets both ICU converters to their initial state.
+void TextEncodingConverterICU::reset() {
+  ucnv_reset(&*FromConvDesc);
+  ucnv_reset(&*ToConvDesc);
+}
+
+#elif HAVE_ICONV
+// Conversion back end built on an iconv conversion descriptor.
+class TextEncodingConverterIconv final
+    : public details::TextEncodingConverterImplBase {
+  // Move-only owner of an iconv_t; (iconv_t)-1 marks "no descriptor".
+  class UniqueIconvT {
+    iconv_t ConvDesc;
+
+  public:
+    operator iconv_t() const { return ConvDesc; }
+    UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
+    ~UniqueIconvT() {
+      if (ConvDesc != (iconv_t)-1) {
+        iconv_close(ConvDesc);
+        ConvDesc = (iconv_t)-1;
+      }
+    }
+    UniqueIconvT(UniqueIconvT &&Other) noexcept : ConvDesc(Other.ConvDesc) {
+      Other.ConvDesc = (iconv_t)-1;
+    }
+    UniqueIconvT &operator=(UniqueIconvT &&Other) noexcept {
+      if (&Other != this) {
+        // Close the descriptor currently owned before taking over the other
+        // one; otherwise it would be leaked.
+        if (ConvDesc != (iconv_t)-1)
+          iconv_close(ConvDesc);
+        ConvDesc = Other.ConvDesc;
+        Other.ConvDesc = (iconv_t)-1;
+      }
+      return *this;
+    }
+  };
+  UniqueIconvT ConvDesc;
+
+public:
+  TextEncodingConverterIconv(UniqueIconvT ConvDesc)
+      : ConvDesc(std::move(ConvDesc)) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override;
+};
+
+// TODO: The current implementation discards the partial result and restarts the
+// conversion from the beginning if there is a conversion error due to
+// insufficient buffer size. In the future, it would be better to save the
+// partial result and resume the conversion for the remaining string.
+//
+// Converts Source with iconv(), writing directly into Result. On failure the
+// errno value reported by iconv() is returned; a nonreversible conversion is
+// reported as std::errc::illegal_byte_sequence.
+std::error_code
+TextEncodingConverterIconv::convertString(StringRef Source,
+                                          SmallVectorImpl<char> &Result) {
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  char *Output = static_cast<char *>(Result.data());
+  size_t OutputLength = Capacity;
+  Result.resize_for_overwrite(Capacity);
+
+  // Handle errors returned from iconv(). Returns a default-constructed
+  // error_code if the buffer was grown and the conversion should be retried.
+  auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
+                      this](size_t Ret) {
+    if (Ret == static_cast<size_t>(-1)) {
+      // Save errno before making any call that could modify it.
+      const int Err = errno;
+      // An error occurred. Check if we can gracefully handle it.
+      if (Err == E2BIG && Capacity < Result.max_size()) {
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+        // Reset converter
+        reset();
+        return std::error_code();
+      }
+      // Some other error occurred.
+      Result.resize(Output - Result.data());
+      return std::error_code(Err, std::generic_category());
+    }
+    // A positive return value indicates that some characters were converted
+    // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+    // an error in this case makes sure that both conversion routines behave
+    // in the same way. As for the other errors, shrink the result to the part
+    // that was actually written.
+    Result.resize(Output - Result.data());
+    return std::make_error_code(std::errc::illegal_byte_sequence);
+  };
+
+  do {
+    // Setup the input. Use an empty string literal if the source has no
+    // backing data. iconv() takes a non-const pointer but does not modify
+    // the input.
+    size_t InputLength = Source.size();
+    char *Input = const_cast<char *>(InputLength ? Source.data() : "");
+    size_t Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    // Flush the converter
+    Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    break;
+  } while (true);
+
+  // Re-adjust size to actual size.
+  Result.resize(Output - Result.data());
+  return std::error_code();
+}
+
+// Puts the iconv descriptor back into its initial state.
+inline void TextEncodingConverterIconv::reset() {
+  iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+}
+
+#endif // HAVE_ICONV
+} // namespace
+
+// Creates a table-based converter. Only the two directions between UTF-8 and
+// IBM-1047 are supported; everything else yields invalid_argument.
+ErrorOr<TextEncodingConverter>
+TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) {
+
+  // Text encodings should be distinct.
+  if (CPFrom == CPTo)
+    return std::make_error_code(std::errc::invalid_argument);
+
+  ConversionType Conversion;
+  if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
+    Conversion = UTF8ToIBM1047;
+  else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
+    Conversion = IBM1047ToUTF8;
+  else
+    return std::make_error_code(std::errc::invalid_argument);
+
+  return TextEncodingConverter(
+      std::make_unique<TextEncodingConverterTable>(Conversion));
+}
+
+// Creates a converter from encoding names. The built-in table converter is
+// preferred when both names are known; otherwise ICU or iconv is used if LLVM
+// was configured with one of them.
+ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From,
+                                                             StringRef To) {
+  std::optional<TextEncoding> FromEncoding = getKnownEncoding(From);
+  std::optional<TextEncoding> ToEncoding = getKnownEncoding(To);
+  if (FromEncoding && ToEncoding) {
+    ErrorOr<TextEncodingConverter> Converter =
+        create(*FromEncoding, *ToEncoding);
+    if (Converter)
+      return Converter;
+  }
+#if HAVE_ICU
+  UErrorCode EC = U_ZERO_ERROR;
+  UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
+  if (U_FAILURE(EC))
+    return std::make_error_code(std::errc::invalid_argument);
+
+  UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
+  if (U_FAILURE(EC))
+    return std::make_error_code(std::errc::invalid_argument);
+
+  auto Converter = std::make_unique<TextEncodingConverterICU>(
+      std::move(FromConvDesc), std::move(ToConvDesc));
+  return TextEncodingConverter(std::move(Converter));
+#elif HAVE_ICONV
+  iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
+  if (ConvDesc == (iconv_t)-1)
+    return std::make_error_code(std::errc::invalid_argument);
+  return TextEncodingConverter(
+      std::make_unique<TextEncodingConverterIconv>(ConvDesc));
+#else
+  return std::make_error_code(std::errc::invalid_argument);
+#endif
+}
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index b6b9398df5e2e..d048e871fd0fb 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -88,6 +88,7 @@ add_llvm_unittest(SupportTests
   SuffixTreeTest.cpp
   SwapByteOrderTest.cpp
   TarWriterTest.cpp
+  TextEncodingTest.cpp
   ThreadPool.cpp
   ThreadSafeAllocatorTest.cpp
   Threading.cpp
diff --git a/llvm/unittests/Support/ConvertEBCDICTest.cpp b/llvm/unittests/Support/ConvertEBCDICTest.cpp
index eec76879ac92c..557f29c391f9c 100644
--- a/llvm/unittests/Support/ConvertEBCDICTest.cpp
+++ b/llvm/unittests/Support/ConvertEBCDICTest.cpp
@@ -41,7 +41,7 @@ static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
 // String with Cyrillic character ya.
 static const char CyrillicUTF[] = "\xd0\xaf";
 
-TEST(CharSet, FromUTF8) {
+TEST(ConverterEBCDIC, convertToEBCDIC) {
   // Hello string.
   StringRef Src(HelloA);
   SmallString<64> Dst;
@@ -72,7 +72,7 @@ TEST(CharSet, FromUTF8) {
   Dst.clear();
 }
 
-TEST(CharSet, ToUTF8) {
+TEST(ConverterEBCDIC, convertFromEBCDIC) {
   // Hello string.
   StringRef Src(HelloE);
   SmallString<64> Dst;
diff --git a/llvm/unittests/Support/TextEncodingTest.cpp b/llvm/unittests/Support/TextEncodingTest.cpp
new file mode 100644
index 0000000000000..a453c0a34a5fe
--- /dev/null
+++ b/llvm/unittests/Support/TextEncodingTest.cpp
@@ -0,0 +1,299 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/TextEncoding.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Config/config.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+// String "Hello World!"
+static const char HelloA[] =
+    "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a";
+static const char HelloE[] =
+    "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15";
+
+// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+static const char ABCStrA[] =
+    "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52"
+    "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A"
+    "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A";
+static const char ABCStrE[] =
+    "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9"
+    "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91"
+    "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9";
+
+// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë"
+static const char AccentUTF[] =
+    "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89"
+    "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9"
+    "\xc3\xaa\xc3\xab";
+static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
+                              "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53";
+
+// String with Cyrillic character ya.
+static const char CyrillicUTF[] = "\xd0\xaf";
+
+// String "Earth地球".
+// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and
+// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII.
+// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
+// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
+static const char EarthISO2022[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
+static const char EarthIBM939[] =
+    "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+// EarthUTF followed by the first byte of another multibyte sequence.
+static const char EarthUTFExtraPartial[] =
+    "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83\xe5";
+
+// Note: converters are checked with ASSERT_TRUE before use, because the tests
+// dereference them right afterwards.
+
+TEST(Encoding, FromUTF8) {
+  // Hello string.
+  StringRef Src(HelloA);
+  SmallString<64> Dst;
+
+  ErrorOr<TextEncodingConverter> Conv =
+      TextEncodingConverter::create(TextEncoding::UTF8, TextEncoding::IBM1047);
+
+  // Converter should always exist between UTF-8 and IBM-1047
+  ASSERT_TRUE(Conv);
+
+  std::error_code EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrA;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentUTF;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Cyrillic string. Results in error because not representable in 1047.
+  Src = CyrillicUTF;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
+}
+
+TEST(Encoding, ToUTF8) {
+  // Hello string.
+  StringRef Src(HelloE);
+  SmallString<64> Dst;
+
+  ErrorOr<TextEncodingConverter> Conv =
+      TextEncodingConverter::create(TextEncoding::IBM1047, TextEncoding::UTF8);
+
+  // Converter should always exist between UTF-8 and IBM-1047
+  ASSERT_TRUE(Conv);
+
+  std::error_code EC = Conv->convert(Src, Dst);
+
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrE;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentE;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(Encoding, RoundTrip) {
+  ErrorOr<TextEncodingConverter> ConvToUTF16 =
+      TextEncodingConverter::create("IBM-1047", "UTF-16");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToUTF16);
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF16) {
+    ASSERT_EQ(ConvToUTF16.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  ErrorOr<TextEncodingConverter> ConvToUTF32 =
+      TextEncodingConverter::create("UTF-16", "UTF-32");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToUTF32);
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF32) {
+    ASSERT_EQ(ConvToUTF32.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  ErrorOr<TextEncodingConverter> ConvToEBCDIC =
+      TextEncodingConverter::create("UTF-32", "IBM-1047");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToEBCDIC);
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToEBCDIC) {
+    ASSERT_EQ(ConvToEBCDIC.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  // Setup source string. The values 1..255 are followed by 0 in the last
+  // element, so the array is a NUL-terminated string of every non-zero byte.
+  char SrcStr[256];
+  for (size_t I = 0; I < 256; ++I)
+    SrcStr[I] = (I + 1) % 256;
+
+  SmallString<99> Dst1Str, Dst2Str, Dst3Str;
+
+  std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str);
+  EXPECT_TRUE(!EC);
+  EC = ConvToUTF32->convert(Dst1Str, Dst2Str);
+  EXPECT_TRUE(!EC);
+  EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
+}
+
+TEST(Encoding, ShiftState2022) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<8> Dst;
+
+  ErrorOr<TextEncodingConverter> ConvTo2022 =
+      TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvTo2022);
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvTo2022->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(Encoding, InvalidInput) {
+  // Earth string.
+  StringRef Src(EarthUTFExtraPartial);
+  SmallString<8> Dst;
+
+  ErrorOr<TextEncodingConverter> ConvTo2022 =
+      TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvTo2022);
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  // Check that the string failed to convert.
+  std::error_code EC = ConvTo2022->convert(Src, Dst);
+  EXPECT_TRUE(EC);
+}
+
+TEST(Encoding, InvalidOutput) {
+  // Cyrillic in UTF-16
+  ErrorOr<TextEncodingConverter> ConvToUTF16 =
+      TextEncodingConverter::create("UTF-8", "UTF-16");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToUTF16);
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF16) {
+    ASSERT_EQ(ConvToUTF16.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  ErrorOr<TextEncodingConverter> ConvToEBCDIC =
+      TextEncodingConverter::create("UTF-16", "IBM-1047");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToEBCDIC);
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToEBCDIC) {
+    ASSERT_EQ(ConvToEBCDIC.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  // Cyrillic string. Convert to UTF-16 and check if properly converted
+  StringRef Src(CyrillicUTF);
+  SmallString<8> Dst, Dst1;
+  std::error_code EC = ConvToUTF16->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+
+  // Cyrillic string. Results in error because not representable in 1047.
+  EC = ConvToEBCDIC->convert(Dst, Dst1);
+  EXPECT_TRUE(EC);
+}
+
+TEST(Encoding, ShiftStateIBM939) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<64> Dst;
+
+  ErrorOr<TextEncodingConverter> ConvToIBM939 =
+      TextEncodingConverter::create("UTF-8", "IBM-939");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToIBM939);
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToIBM939) {
+    ASSERT_EQ(ConvToIBM939.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvToIBM939->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
+}
+
+} // namespace