// // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) // // Distributed under the Boost Software License, Version 1.0. (See // accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // #ifndef BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP #define BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP #include #include #include #include #include "conv.hpp" #ifndef NOMINMAX # define NOMINMAX #endif #include #include namespace boost { namespace locale { namespace conv { namespace impl { struct windows_encoding { char const *name; unsigned codepage; unsigned was_tested; }; bool operator<(windows_encoding const &l,windows_encoding const &r) { return strcmp(l.name,r.name) < 0; } windows_encoding all_windows_encodings[] = { { "big5", 950, 0 }, { "cp1250", 1250, 0 }, { "cp1251", 1251, 0 }, { "cp1252", 1252, 0 }, { "cp1253", 1253, 0 }, { "cp1254", 1254, 0 }, { "cp1255", 1255, 0 }, { "cp1256", 1256, 0 }, { "cp1257", 1257, 0 }, { "cp874", 874, 0 }, { "cp932", 932, 0 }, { "cp936", 936, 0 }, { "eucjp", 20932, 0 }, { "euckr", 51949, 0 }, { "gb18030", 54936, 0 }, { "gb2312", 20936, 0 }, { "gbk", 936, 0 }, { "iso2022jp", 50220, 0 }, { "iso2022kr", 50225, 0 }, { "iso88591", 28591, 0 }, { "iso885913", 28603, 0 }, { "iso885915", 28605, 0 }, { "iso88592", 28592, 0 }, { "iso88593", 28593, 0 }, { "iso88594", 28594, 0 }, { "iso88595", 28595, 0 }, { "iso88596", 28596, 0 }, { "iso88597", 28597, 0 }, { "iso88598", 28598, 0 }, { "iso88599", 28599, 0 }, { "koi8r", 20866, 0 }, { "koi8u", 21866, 0 }, { "ms936", 936, 0 }, { "shiftjis", 932, 0 }, { "sjis", 932, 0 }, { "usascii", 20127, 0 }, { "utf8", 65001, 0 }, { "windows1250", 1250, 0 }, { "windows1251", 1251, 0 }, { "windows1252", 1252, 0 }, { "windows1253", 1253, 0 }, { "windows1254", 1254, 0 }, { "windows1255", 1255, 0 }, { "windows1256", 1256, 0 }, { "windows1257", 1257, 0 }, { "windows874", 874, 0 }, { "windows932", 932, 0 }, { "windows936", 936, 0 }, }; size_t remove_substitutions(std::vector &v) { if(std::find(v.begin(),v.end(),0) == v.end()) { return v.size(); } std::vector v2; v2.reserve(v.size()); for(unsigned i=0;i &buf) { buf.reserve(end-begin); while(begin!=end) { wchar_t wide_buf[4]; int n = 0; int len = IsDBCSLeadByteEx(codepage,*begin) ? 2 : 1; if(len == 2 && begin+1==end) return; n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,len,wide_buf,4); for(int i=0;i &buf) { if(begin==end) return; int n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,0,0); if(n == 0) { if(do_skip) { multibyte_to_wide_one_by_one(codepage,begin,end,buf); return; } throw conversion_error(); } buf.resize(n,0); if(MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,&buf.front(),buf.size())==0) throw conversion_error(); } void wide_to_multibyte_non_zero(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector &buf) { if(begin==end) return; BOOL substitute = FALSE; BOOL *substitute_ptr = codepage == 65001 || codepage == 65000 ? 0 : &substitute; char subst_char = 0; char *subst_char_ptr = codepage == 65001 || codepage == 65000 ? 0 : &subst_char; int n = WideCharToMultiByte(codepage,0,begin,end-begin,0,0,subst_char_ptr,substitute_ptr); buf.resize(n); if(WideCharToMultiByte(codepage,0,begin,end-begin,&buf[0],n,subst_char_ptr,substitute_ptr)==0) throw conversion_error(); if(substitute) { if(do_skip) remove_substitutions(buf); else throw conversion_error(); } } void wide_to_multibyte(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector &buf) { if(begin==end) return; buf.reserve(end-begin); wchar_t const *e = std::find(begin,end,L'\0'); wchar_t const *b = begin; for(;;) { std::vector tmp; wide_to_multibyte_non_zero(codepage,b,e,do_skip,tmp); size_t osize = buf.size(); buf.resize(osize+tmp.size()); std::copy(tmp.begin(),tmp.end(),buf.begin()+osize); if(e!=end) { buf.push_back('\0'); b=e+1; e=std::find(b,end,L'0'); } else break; } } int encoding_to_windows_codepage(char const *ccharset) { std::string charset = normalize_encoding(ccharset); windows_encoding ref; ref.name = charset.c_str(); size_t n = sizeof(all_windows_encodings)/sizeof(all_windows_encodings[0]); windows_encoding *begin = all_windows_encodings; windows_encoding *end = all_windows_encodings + n; windows_encoding *ptr = std::lower_bound(begin,end,ref); if(ptr!=end && strcmp(ptr->name,charset.c_str())==0) { if(ptr->was_tested) { return ptr->codepage; } else if(IsValidCodePage(ptr->codepage)) { // the thread safety is not an issue, maximum // it would be checked more then once ptr->was_tested=1; return ptr->codepage; } else { return -1; } } return -1; } template bool validate_utf16(CharType const *str,unsigned len) { CharType const *begin = str; CharType const *end = str+len; while(begin!=end) { utf::code_point c = utf::utf_traits::template decode(begin,end); if(c==utf::illegal || c==utf::incomplete) return false; } return true; } template void clean_invalid_utf16(CharType const *str,unsigned len,std::vector &out) { out.reserve(len); for(unsigned i=0;i(str[i]); if(0xD800 <= c && c<= 0xDBFF) { i++; if(i>=len) return; uint16_t c2=static_cast(str[i]); if(0xDC00 <= c2 && c2 <= 0xDFFF) { out.push_back(static_cast(c)); out.push_back(static_cast(c2)); } } else if(0xDC00 <= c && c <=0xDFFF) continue; else out.push_back(static_cast(c)); } } class wconv_between : public converter_between { public: wconv_between() : how_(skip), to_code_page_ (-1), from_code_page_ ( -1) { } bool open(char const *to_charset,char const *from_charset,method_type how) { how_ = how; to_code_page_ = encoding_to_windows_codepage(to_charset); from_code_page_ = encoding_to_windows_codepage(from_charset); if(to_code_page_ == -1 || from_code_page_ == -1) return false; return true; } virtual std::string convert(char const *begin,char const *end) { if(to_code_page_ == 65001 && from_code_page_ == 65001) return utf_to_utf(begin,end,how_); std::string res; std::vector tmp; // buffer for mb2w std::wstring tmps; // buffer for utf_to_utf wchar_t const *wbegin=0; wchar_t const *wend=0; if(from_code_page_ == 65001) { tmps = utf_to_utf(begin,end,how_); if(tmps.empty()) return res; wbegin = tmps.c_str(); wend = wbegin + tmps.size(); } else { multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp); if(tmp.empty()) return res; wbegin = &tmp[0]; wend = wbegin + tmp.size(); } if(to_code_page_ == 65001) { return utf_to_utf(wbegin,wend,how_); } std::vector ctmp; wide_to_multibyte(to_code_page_,wbegin,wend,how_ == skip,ctmp); if(ctmp.empty()) return res; res.assign(&ctmp.front(),ctmp.size()); return res; } private: method_type how_; int to_code_page_; int from_code_page_; }; template class wconv_to_utf; template class wconv_from_utf; template<> class wconv_to_utf : public converter_to_utf , public wconv_between { public: virtual bool open(char const *cs,method_type how) { return wconv_between::open("UTF-8",cs,how); } virtual std::string convert(char const *begin,char const *end) { return wconv_between::convert(begin,end); } }; template<> class wconv_from_utf : public converter_from_utf , public wconv_between { public: virtual bool open(char const *cs,method_type how) { return wconv_between::open(cs,"UTF-8",how); } virtual std::string convert(char const *begin,char const *end) { return wconv_between::convert(begin,end); } }; template class wconv_to_utf : public converter_to_utf { public: typedef CharType char_type; typedef std::basic_string string_type; wconv_to_utf() : how_(skip), code_page_(-1) { } virtual bool open(char const *charset,method_type how) { how_ = how; code_page_ = encoding_to_windows_codepage(charset); return code_page_ != -1; } virtual string_type convert(char const *begin,char const *end) { if(code_page_ == 65001) { return utf_to_utf(begin,end,how_); } std::vector tmp; multibyte_to_wide(code_page_,begin,end,how_ == skip,tmp); string_type res; if(!tmp.empty()) res.assign(reinterpret_cast(&tmp.front()),tmp.size()); return res; } private: method_type how_; int code_page_; }; template class wconv_from_utf : public converter_from_utf { public: typedef CharType char_type; typedef std::basic_string string_type; wconv_from_utf() : how_(skip), code_page_(-1) { } virtual bool open(char const *charset,method_type how) { how_ = how; code_page_ = encoding_to_windows_codepage(charset); return code_page_ != -1; } virtual std::string convert(CharType const *begin,CharType const *end) { if(code_page_ == 65001) { return utf_to_utf(begin,end,how_); } wchar_t const *wbegin = 0; wchar_t const *wend = 0; std::vector buffer; // if needed if(begin==end) return std::string(); if(validate_utf16(begin,end-begin)) { wbegin = reinterpret_cast(begin); wend = reinterpret_cast(end); } else { if(how_ == stop) { throw conversion_error(); } else { clean_invalid_utf16(begin,end-begin,buffer); if(!buffer.empty()) { wbegin = &buffer[0]; wend = wbegin + buffer.size(); } } } std::string res; if(wbegin==wend) return res; std::vector ctmp; wide_to_multibyte(code_page_,wbegin,wend,how_ == skip,ctmp); if(ctmp.empty()) return res; res.assign(&ctmp.front(),ctmp.size()); return res; } private: method_type how_; int code_page_; }; template class wconv_to_utf : public converter_to_utf { public: typedef CharType char_type; typedef std::basic_string string_type; wconv_to_utf() : how_(skip), code_page_(-1) { } virtual bool open(char const *charset,method_type how) { how_ = how; code_page_ = encoding_to_windows_codepage(charset); return code_page_ != -1; } virtual string_type convert(char const *begin,char const *end) { if(code_page_ == 65001) { return utf_to_utf(begin,end,how_); } std::vector buf; multibyte_to_wide(code_page_,begin,end,how_ == skip,buf); if(buf.empty()) return string_type(); return utf_to_utf(&buf[0],&buf[0]+buf.size(),how_); } private: method_type how_; int code_page_; }; template class wconv_from_utf : public converter_from_utf { public: typedef CharType char_type; typedef std::basic_string string_type; wconv_from_utf() : how_(skip), code_page_(-1) { } virtual bool open(char const *charset,method_type how) { how_ = how; code_page_ = encoding_to_windows_codepage(charset); return code_page_ != -1; } virtual std::string convert(CharType const *begin,CharType const *end) { if(code_page_ == 65001) { return utf_to_utf(begin,end,how_); } std::wstring tmp = utf_to_utf(begin,end,how_); std::vector ctmp; wide_to_multibyte(code_page_,tmp.c_str(),tmp.c_str()+tmp.size(),how_ == skip,ctmp); std::string res; if(ctmp.empty()) return res; res.assign(&ctmp.front(),ctmp.size()); return res; } private: method_type how_; int code_page_; }; } // impl } // conv } // locale } // boost #endif // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4