diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index cfd1a086c0fc2..9884ad3579268 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -572,6 +572,10 @@ else()
   option(LLVM_ENABLE_THREADS "Use threads if available." ON)
 endif()
 
+set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
+set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
 set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 15ae04f5a6913..e8ccbf6f006b9 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -294,6 +294,41 @@ if(LLVM_HAS_LOGF128)
   set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
 endif()
 
+if (LLVM_ENABLE_ICU STREQUAL FORCE_ON AND LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+  message(FATAL_ERROR "LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV should not both be FORCE_ON")
+endif()
+
+# Check for ICU. Only shared ICU libraries are considered (the library suffix search list is temporarily restricted to CMAKE_SHARED_LIBRARY_SUFFIX), so ICU is only ever an optional, dynamic link and does not impact LLVM's licensing.
+if(LLVM_ENABLE_ICU AND NOT(LLVM_ENABLE_ICONV STREQUAL FORCE_ON))
+  set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}")
+  if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
+    find_package(ICU REQUIRED COMPONENTS uc i18n)
+    if (NOT ICU_FOUND)
+      message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
+    endif()
+  else()
+    find_package(ICU COMPONENTS uc i18n)
+  endif()
+  set(HAVE_ICU ${ICU_FOUND})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
+endif()
+
+# Check for iconv. Only a builtin iconv (Iconv_IS_BUILT_IN) is accepted, to avoid licensing issues; the check is skipped when ICU was already found.
+if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU)
+  if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+    find_package(Iconv REQUIRED)
+    if (NOT Iconv_FOUND OR NOT Iconv_IS_BUILT_IN)
+      message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
+    endif()
+  else()
+    find_package(Iconv)
+  endif()
+  if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
+    set(HAVE_ICONV 1)
+  endif()
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 835201f2a45b0..5ea40f73b15af 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -240,6 +240,12 @@
 /* Have host's ___chkstk_ms */
 #cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
 
+/* Define if the ICU library is available for character set conversion */
+#cmakedefine HAVE_ICU ${HAVE_ICU}
+
+/* Define if a builtin iconv is available for character set conversion */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
 /* Linker version detected at compile time. */
 #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
 
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
new file mode 100644
index 0000000000000..6a28cd19f4143
--- /dev/null
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -0,0 +1,141 @@
+//===-- CharSet.h - Character set conversion class ----------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHARSET_H
+#define LLVM_SUPPORT_CHARSET_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <string>
+#include <system_error>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+namespace details {
+class CharSetConverterImplBase {
+private:
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return error code in case something went wrong
+  ///
+  /// The following error codes can occur, among others:
+  ///   - std::errc::argument_list_too_long: The result requires more than
+  ///     std::numeric_limits<size_t>::max() bytes.
+  ///   - std::errc::illegal_byte_sequence: The input contains an invalid
+  ///     multibyte sequence.
+  ///   - std::errc::invalid_argument: The input contains an incomplete
+  ///     multibyte sequence.
+  ///
+  /// If the destination charset is a stateful character set, the shift state
+  /// will be set to the initial state.
+  ///
+  /// In case of an error, the result string contains the successfully converted
+  /// part of the input string.
+  ///
+  virtual std::error_code convertString(StringRef Source,
+                                        SmallVectorImpl<char> &Result) = 0;
+
+  /// Resets the converter to the initial state.
+  virtual void reset() = 0;
+
+public:
+  virtual ~CharSetConverterImplBase() = default;
+
+  /// Converts a string and resets the converter to the initial state.
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
+    auto EC = convertString(Source, Result);
+    reset();
+    return EC;
+  }
+};
+} // namespace details
+
+// Names inspired by https://wg21.link/p1885.
+namespace text_encoding {
+enum class id {
+  /// UTF-8 character set encoding.
+  UTF8,
+
+  /// IBM EBCDIC 1047 character set encoding.
+  IBM1047
+};
+} // end namespace text_encoding
+
+/// Utility class to convert between different character set encodings.
+class CharSetConverter {
+  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+
+  CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter)
+      : Converter(std::move(Converter)) {}
+
+public:
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CSFrom the source character encoding
+  /// \param[in] CSTo the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(text_encoding::id CSFrom,
+                                          text_encoding::id CSTo);
+
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CPFrom name of the source character encoding
+  /// \param[in] CPTo name of the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
+
+  CharSetConverter(const CharSetConverter &) = delete;
+  CharSetConverter &operator=(const CharSetConverter &) = delete;
+
+  CharSetConverter(CharSetConverter &&Other) noexcept
+      : Converter(std::move(Other.Converter)) {}
+
+  CharSetConverter &operator=(CharSetConverter &&Other) noexcept {
+    if (this != &Other)
+      Converter = std::move(Other.Converter);
+    return *this;
+  }
+
+  ~CharSetConverter() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const {
+    return Converter->convert(Source, Result);
+  }
+
+  /// Converts a string and returns the converted text, or the error code.
+  ErrorOr<std::string> convert(StringRef Source) const {
+    SmallString<100> Result;
+    auto EC = Converter->convert(Source, Result);
+    if (!EC)
+      return std::string(Result);
+    return EC;
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 98ffd829d80b8..d5d822784340b 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -162,6 +162,7 @@ add_llvm_component_library(LLVMSupport
   CachePruning.cpp
   Caching.cpp
   circular_raw_ostream.cpp
+  CharSet.cpp
   Chrono.cpp
   COM.cpp
   CodeGenCoverage.cpp
@@ -315,6 +316,14 @@ add_llvm_component_library(LLVMSupport
   Demangle
   )
 
+# Link ICU library if it is an external library.
+if(ICU_FOUND)
+  target_link_libraries(LLVMSupport
+    PRIVATE
+    ${ICU_LIBRARIES}
+    )
+endif()
+
 set(llvm_system_libs ${system_libs})
 
 # This block is only needed for llvm-config. When we deprecate llvm-config and
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
new file mode 100644
index 0000000000000..6810cf9c6e376
--- /dev/null
+++ b/llvm/lib/Support/CharSet.cpp
@@ -0,0 +1,344 @@
+//===-- CharSet.cpp - Character set conversion classes ------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#ifdef HAVE_ICU
+#include <unicode/ucnv.h>
+#elif defined(HAVE_ICONV)
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Normalize the charset name with the charset alias matching algorithm proposed
+// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
+// Only alphanumeric characters are kept, letters are lower-cased, and a '0'
+// that does not follow a digit is dropped. The normalized name is appended to
+// Normalized.
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
+  bool PrevDigit = false;
+  for (auto Ch : CSName) {
+    if (isAlnum(Ch)) {
+      Ch = toLower(Ch);
+      if (Ch != '0' || PrevDigit) {
+        PrevDigit = isDigit(Ch);
+        Normalized.push_back(Ch);
+      }
+    }
+  }
+}
+
+// Maps the charset name to enum constant if possible. Returns std::nullopt if
+// the normalized name is neither "utf8" nor "ibm1047".
+static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+  SmallString<16> Normalized;
+  normalizeCharSetName(CSName, Normalized);
+  if (Normalized.equals("utf8"))
+    return text_encoding::id::UTF8;
+  if (Normalized.equals("ibm1047"))
+    return text_encoding::id::IBM1047;
+  return std::nullopt;
+}
+
+// Grows the output buffer after a conversion ran out of space. On return,
+// Capacity holds the new buffer size, Result has been resized to it, and
+// Output / OutputLength describe the whole (new) buffer. The previous content
+// of Result is discarded, so the caller has to restart the conversion.
+LLVM_ATTRIBUTE_UNUSED static void
+HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
+               SmallVectorImpl<char> &Result) {
+  // No space left in output buffer. Double the size of the underlying
+  // memory in the SmallVectorImpl (saturating at the maximum size_t value),
+  // adjust pointer and length and continue the conversion.
+  // A buffer that starts out with zero capacity would never grow by doubling
+  // and the caller would retry forever, so use a small non-zero capacity then.
+  constexpr size_t MaxCapacity = std::numeric_limits<size_t>::max();
+  constexpr size_t MinCapacity = 16;
+  if (Capacity == 0)
+    Capacity = MinCapacity;
+  else if (Capacity < MaxCapacity / 2)
+    Capacity = 2 * Capacity;
+  else
+    Capacity = MaxCapacity;
+  // Shrink to zero first so that growing does not copy the stale content.
+  Result.resize(0);
+  Result.resize_for_overwrite(Capacity);
+  Output = static_cast<char *>(Result.data());
+  OutputLength = Capacity;
+}
+
+namespace {
+enum ConversionType {
+  UTF8ToIBM1047,
+  IBM1047ToUTF8,
+};
+
+// Support conversion between EBCDIC 1047 and UTF-8.
+// CharSetConverterTable uses built-in translation tables to convert between
+// EBCDIC 1047 and UTF-8. The use of tables for conversion is only
+// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
+// character sets are not supported.
+class CharSetConverterTable : public details::CharSetConverterImplBase {
+  const ConversionType ConvType;
+
+public:
+  CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  // The table based conversion is stateless; there is nothing to reset.
+  void reset() override {}
+};
+
+std::error_code
+CharSetConverterTable::convertString(StringRef Source,
+                                     SmallVectorImpl<char> &Result) {
+  if (ConvType == IBM1047ToUTF8) {
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  } else if (ConvType == UTF8ToIBM1047) {
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  }
+  llvm_unreachable("Invalid ConvType!");
+  return std::error_code();
+}
+
+#ifdef HAVE_ICU
+// Deleter closing an ICU converter handle.
+struct UConverterDeleter {
+  void operator()(UConverter *Converter) const {
+    if (Converter)
+      ucnv_close(Converter);
+  }
+};
+using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
+
+// Converter based on ICU. The conversion goes through two ICU converters: one
+// for the source encoding and one for the target encoding.
+class CharSetConverterICU : public details::CharSetConverterImplBase {
+  UConverterUniquePtr FromConvDesc;
+  UConverterUniquePtr ToConvDesc;
+
+public:
+  CharSetConverterICU(UConverterUniquePtr FromConverter,
+                      UConverterUniquePtr ToConverter)
+      : FromConvDesc(std::move(FromConverter)),
+        ToConvDesc(std::move(ToConverter)) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override;
+};
+
+std::error_code
+CharSetConverterICU::convertString(StringRef Source,
+                                   SmallVectorImpl<char> &Result) {
+  // An empty input converts to an empty output. Handle it up front: the
+  // result size below is computed from the output pointer, so the conversion
+  // must never be attempted with a null output pointer.
+  if (Source.empty()) {
+    Result.clear();
+    return std::error_code();
+  }
+
+  // Setup the input.
+  const char *In = Source.data();
+  size_t InputLength = Source.size();
+
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  size_t OutputLength = Capacity;
+  Result.resize_for_overwrite(Capacity);
+  char *Output = Result.data();
+  UErrorCode EC = U_ZERO_ERROR;
+
+  // Stop the conversion at the first character that cannot be converted.
+  ucnv_setToUCallBack(FromConvDesc.get(), UCNV_TO_U_CALLBACK_STOP, nullptr,
+                      nullptr, nullptr, &EC);
+  ucnv_setFromUCallBack(ToConvDesc.get(), UCNV_FROM_U_CALLBACK_STOP, nullptr,
+                        nullptr, nullptr, &EC);
+  assert(U_SUCCESS(EC));
+
+  do {
+    // Every attempt converts the whole input into the start of the buffer.
+    EC = U_ZERO_ERROR;
+    const char *Input = In;
+
+    Output = Result.data();
+    ucnv_convertEx(ToConvDesc.get(), FromConvDesc.get(), &Output, Result.end(),
+                   &Input, In + InputLength, /*pivotStart=*/nullptr,
+                   /*pivotSource=*/nullptr, /*pivotTarget=*/nullptr,
+                   /*pivotLimit=*/nullptr, /*reset=*/true,
+                   /*flush=*/true, &EC);
+    if (U_FAILURE(EC)) {
+      if (EC == U_BUFFER_OVERFLOW_ERROR &&
+          Capacity < std::numeric_limits<size_t>::max()) {
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+        continue;
+      }
+      // Some other error occurred. Keep the successfully converted part.
+      Result.resize(Output - Result.data());
+      return std::make_error_code(std::errc::illegal_byte_sequence);
+    }
+    break;
+  } while (true);
+
+  // Re-adjust size to actual size.
+  Result.resize(Output - Result.data());
+  return std::error_code();
+}
+
+void CharSetConverterICU::reset() {
+  ucnv_reset(FromConvDesc.get());
+  ucnv_reset(ToConvDesc.get());
+}
+
+#elif defined(HAVE_ICONV)
+// Converter based on iconv.
+class CharSetConverterIconv : public details::CharSetConverterImplBase {
+  // Move-only owner of an iconv conversion descriptor. (iconv_t)-1 marks the
+  // "owns nothing" state.
+  class UniqueIconvT {
+    iconv_t ConvDesc;
+
+  public:
+    operator iconv_t() const { return ConvDesc; }
+    UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
+    ~UniqueIconvT() {
+      if (ConvDesc != (iconv_t)-1) {
+        iconv_close(ConvDesc);
+        ConvDesc = (iconv_t)-1;
+      }
+    }
+    UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
+      Other.ConvDesc = (iconv_t)-1;
+    }
+    UniqueIconvT &operator=(UniqueIconvT &&Other) {
+      if (&Other != this) {
+        // Release the descriptor owned so far before taking over Other's;
+        // otherwise it would be leaked.
+        if (ConvDesc != (iconv_t)-1)
+          iconv_close(ConvDesc);
+        ConvDesc = Other.ConvDesc;
+        Other.ConvDesc = (iconv_t)-1;
+      }
+      return *this;
+    }
+  };
+  UniqueIconvT ConvDesc;
+
+public:
+  CharSetConverterIconv(UniqueIconvT ConvDesc)
+      : ConvDesc(std::move(ConvDesc)) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override;
+};
+
+std::error_code
+CharSetConverterIconv::convertString(StringRef Source,
+                                     SmallVectorImpl<char> &Result) {
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  char *Output = static_cast<char *>(Result.data());
+  size_t OutputLength = Capacity;
+  Result.resize_for_overwrite(Capacity);
+
+  size_t Ret;
+  // Handle errors returned from iconv(). Returns a success code if the buffer
+  // was enlarged and the conversion has to be restarted, and the error to
+  // report otherwise.
+  auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
+                      this](size_t Ret) {
+    if (Ret == static_cast<size_t>(-1)) {
+      // Save errno right away; the calls below may overwrite it.
+      const int Err = errno;
+      // An error occurred. Check if we can gracefully handle it.
+      if (Err == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+        // Reset converter
+        iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+        return std::error_code();
+      }
+      // Some other error occurred. Keep the successfully converted part.
+      Result.resize(Output - Result.data());
+      return std::error_code(Err, std::generic_category());
+    }
+    // A positive return value indicates that some characters were converted
+    // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+    // an error in this case makes sure that both conversion routines behave
+    // in the same way. As for the other errors, trim the result to the bytes
+    // that were actually written.
+    Result.resize(Output - Result.data());
+    return std::make_error_code(std::errc::illegal_byte_sequence);
+  };
+
+  do {
+    // Setup the input. Use nullptr to reset iconv state if input length is
+    // zero.
+    size_t InputLength = Source.size();
+    char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+    Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    // Flush the converter
+    Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    break;
+  } while (true);
+
+  // Re-adjust size to actual size.
+  Result.resize(Output - Result.data());
+  return std::error_code();
+}
+
+void CharSetConverterIconv::reset() {
+  iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+}
+
+#endif // HAVE_ICONV
+} // namespace
+
+// Creates a table based converter for the given pair of known encodings.
+// Returns std::errc::invalid_argument if the pair is not supported.
+ErrorOr<CharSetConverter> CharSetConverter::create(text_encoding::id CSFrom,
+                                                   text_encoding::id CSTo) {
+
+  assert(CSFrom != CSTo && "Text encodings should be distinct");
+
+  ConversionType Conversion;
+  if (CSFrom == text_encoding::id::UTF8 && CSTo == text_encoding::id::IBM1047)
+    Conversion = UTF8ToIBM1047;
+  else if (CSFrom == text_encoding::id::IBM1047 &&
+           CSTo == text_encoding::id::UTF8)
+    Conversion = IBM1047ToUTF8;
+  else
+    // Nothing set errno here; report the documented error code explicitly.
+    return std::make_error_code(std::errc::invalid_argument);
+
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterTable>(Conversion);
+  return CharSetConverter(std::move(Converter));
+}
+
+// Creates a converter for the given pair of encoding names. The table based
+// converter is preferred if both names are known; otherwise ICU or iconv is
+// used if available. Returns an error code if the conversion is not supported.
+ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CPFrom,
+                                                   StringRef CPTo) {
+  std::optional<text_encoding::id> From = getKnownCharSet(CPFrom);
+  std::optional<text_encoding::id> To = getKnownCharSet(CPTo);
+  // The enum based overload requires distinct encodings, so only use it if the
+  // two names do not normalize to the same known encoding.
+  if (From && To && *From != *To) {
+    ErrorOr<CharSetConverter> Converter = create(*From, *To);
+    if (Converter)
+      return Converter;
+  }
+#ifdef HAVE_ICU
+  // ICU reports failures through UErrorCode and not through errno, so return
+  // the documented error code explicitly.
+  UErrorCode EC = U_ZERO_ERROR;
+  UConverterUniquePtr FromConvDesc(ucnv_open(CPFrom.str().c_str(), &EC));
+  if (U_FAILURE(EC))
+    return std::make_error_code(std::errc::invalid_argument);
+  UConverterUniquePtr ToConvDesc(ucnv_open(CPTo.str().c_str(), &EC));
+  if (U_FAILURE(EC))
+    return std::make_error_code(std::errc::invalid_argument);
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterICU>(std::move(FromConvDesc),
+                                            std::move(ToConvDesc));
+  return CharSetConverter(std::move(Converter));
+#elif defined(HAVE_ICONV)
+  iconv_t ConvDesc = iconv_open(CPTo.str().c_str(), CPFrom.str().c_str());
+  if (ConvDesc == (iconv_t)-1)
+    return std::error_code(errno, std::generic_category());
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterIconv>(ConvDesc);
+  return CharSetConverter(std::move(Converter));
+#else
+  return std::make_error_code(std::errc::invalid_argument);
+#endif
+}
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 6c4e7cb689b20..d6e34fdaba4e7 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_unittest(SupportTests
   BalancedPartitioningTest.cpp
   BranchProbabilityTest.cpp
   CachePruningTest.cpp
+  CharSetTest.cpp
   CrashRecoveryTest.cpp
   Casting.cpp
   CheckedArithmeticTest.cpp
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
new file mode 100644
index 0000000000000..772d46ec73497
--- /dev/null
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -0,0 +1,232 @@
+//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+
+namespace {
+
+// String "Hello World!"
+static const char HelloA[] =
+    "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a";
+static const char HelloE[] =
+    "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15";
+
+// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+static const char ABCStrA[] =
+    "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52"
+    "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A"
+    "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A";
+static const char ABCStrE[] =
+    "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9"
+    "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91"
+    "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9";
+
+// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë"
+static const char AccentUTF[] =
+    "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89"
+    "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9"
+    "\xc3\xaa\xc3\xab";
+static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
+                              "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53";
+
+// String with Cyrillic character ya.
+static const char CyrillicUTF[] = "\xd0\xaf";
+
+// String "Earth地球".
+// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and
+// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII.
+// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
+// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
+static const char EarthISO2022[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
+static const char EarthIBM939[] =
+    "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+// EarthUTF followed by the first byte only of another multibyte sequence.
+static const char EarthUTFExtraPartial[] =
+    "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83\xe5";
+
+TEST(CharSet, FromUTF8) {
+  // Hello string.
+  StringRef Src(HelloA);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
+      text_encoding::id::UTF8, text_encoding::id::IBM1047);
+
+  // Stop test if conversion is not supported.
+  if (!Conv) {
+    ASSERT_EQ(Conv.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  std::error_code EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrA;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentUTF;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Cyrillic string. Results in error because not representable in 1047.
+  Src = CyrillicUTF;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
+}
+
+TEST(CharSet, ToUTF8) {
+  // Hello string.
+  StringRef Src(HelloE);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
+      text_encoding::id::IBM1047, text_encoding::id::UTF8);
+
+  // Stop test if conversion is not supported.
+  if (!Conv) {
+    ASSERT_EQ(Conv.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  std::error_code EC = Conv->convert(Src, Dst);
+
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrE;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentE;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, RoundTrip) {
+  ErrorOr<CharSetConverter> ConvToUTF16 =
+      CharSetConverter::create("IBM-1047", "UTF-16");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF16) {
+    ASSERT_EQ(ConvToUTF16.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToUTF32 =
+      CharSetConverter::create("UTF-16", "UTF-32");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF32) {
+    ASSERT_EQ(ConvToUTF32.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToEBCDIC =
+      CharSetConverter::create("UTF-32", "IBM-1047");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToEBCDIC) {
+    ASSERT_EQ(ConvToEBCDIC.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Setup source string: bytes 1..255 followed by a terminating 0.
+  char SrcStr[256];
+  for (size_t I = 0; I < 256; ++I)
+    SrcStr[I] = (I + 1) % 256;
+
+  SmallString<99> Dst1Str, Dst2Str, Dst3Str;
+
+  std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str);
+  EXPECT_TRUE(!EC);
+  EC = ConvToUTF32->convert(Dst1Str, Dst2Str);
+  EXPECT_TRUE(!EC);
+  EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
+}
+
+TEST(CharSet, ShiftState2022) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<8> Dst;
+
+  ErrorOr<CharSetConverter> ConvTo2022 =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvTo2022->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, ShiftState2022Partial) {
+  // Earth string with an incomplete trailing multibyte sequence.
+  StringRef Src(EarthUTFExtraPartial);
+  SmallString<8> Dst;
+
+  ErrorOr<CharSetConverter> ConvTo2022 =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the incomplete multibyte sequence is reported as an error.
+  std::error_code EC = ConvTo2022->convert(Src, Dst);
+  EXPECT_TRUE(EC);
+}
+
+TEST(CharSet, ShiftStateIBM939) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> ConvToIBM939 =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToIBM939) {
+    ASSERT_EQ(ConvToIBM939.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvToIBM939->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
+}
+
+} // namespace
diff --git a/llvm/unittests/Support/ConvertEBCDICTest.cpp b/llvm/unittests/Support/ConvertEBCDICTest.cpp
index eec76879ac92c..557f29c391f9c 100644
--- a/llvm/unittests/Support/ConvertEBCDICTest.cpp
+++ b/llvm/unittests/Support/ConvertEBCDICTest.cpp
@@ -41,7 +41,7 @@ static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
 // String with Cyrillic character ya.
 static const char CyrillicUTF[] = "\xd0\xaf";
 
-TEST(CharSet, FromUTF8) {
+TEST(ConverterEBCDIC, convertToEBCDIC) {
   // Hello string.
   StringRef Src(HelloA);
   SmallString<64> Dst;
@@ -72,7 +72,7 @@ TEST(CharSet, FromUTF8) {
   Dst.clear();
 }
 
-TEST(CharSet, ToUTF8) {
+TEST(ConverterEBCDIC, convertFromEBCDIC) {
   // Hello string.
   StringRef Src(HelloE);
   SmallString<64> Dst;