// /usr/include/ucommon/unicode.h

// Copyright (C) 2009-2010 David Sugar, Tycho Softworks.
//
// This file is part of GNU uCommon C++.
//
// GNU uCommon C++ is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// GNU uCommon C++ is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with GNU uCommon C++.  If not, see <http://www.gnu.org/licenses/>.

/**
 * Basic UCommon Unicode support.
 * This includes unicode transcoding and a UTF8-aware string class
 * (UString).  We may add support for a wchar_t aware string class as
 * well, as some external api libraries may require ucs-2 or ucs-4
 * encoded strings.
 * @file ucommon/unicode.h
 */

/**
 * An example of some unicode-utf8 transcoding.
 * @example unicode.cpp
 */

#ifndef _UCOMMON_UNICODE_H_
#define _UCOMMON_UNICODE_H_

#ifndef _UCOMMON_STRING_H_
#include <ucommon/string.h>
#endif

NAMESPACE_UCOMMON

/**
 * 32 bit unicode character code.  We may extract this from a ucs2 or utf8
 * string.  Note that the underlying type (int32_t) is signed.
 */
typedef int32_t ucs4_t;

/**
 * 16 bit unicode character code.  Java and some api's like these.
 * Note that the underlying type (int16_t) is signed, so code units above
 * 0x7fff cannot be represented as positive values.
 */
typedef int16_t ucs2_t;

/**
 * Untyped pointer used to pass unicode character data.  Resolves issues
 * where wchar_t is not defined.  Because this is a plain "void *", a
 * parameter declared "const unicode_t" is a const pointer (void * const),
 * not a pointer to const data.
 */
typedef void *unicode_t;

/**
 * A core class of utf8 encoded string functions.  This is a foundation for
 * all utf8 string processing.  Every member declared here is static.
 * @author David Sugar
 */
class __EXPORT utf8
{
public:
    /**
     * Size of "unicode_t" character codes, may not be ucs4_t size.
     */
    static const unsigned ucsize;

    /**
     * A convenient NULL pointer value.
     */
    static const char *nil;

    /**
     * Compute character size of utf8 string codepoint.
     * @param codepoint in string.
     * @return size of codepoint as utf8 encoded data, 0 if invalid.
     */
    static unsigned size(const char *codepoint);

    /**
     * Count utf8 encoded ucs4 codepoints in string.
     * @param string of utf8 data.
     * @return codepoint count, 0 if empty or invalid.
     */
    static size_t count(const char *string);

    /**
     * Get codepoint offset in a string.
     * @param string of utf8 data.
     * @param position of codepoint in string, negative offsets are from tail.
     * @return pointer to codepoint in string or NULL if invalid.
     */
    static char *offset(char *string, ssize_t position);

    /**
     * Convert a utf8 encoded codepoint to a ucs4 character value.
     * @param encoded utf8 codepoint.
     * @return ucs4 character value or 0 if invalid.
     */
    static ucs4_t codepoint(const char *encoded);

    /**
     * How many chars are required to encode a given unicode string.
     * @param string of ucs4 data.
     * @return number of chars required to encode given string.
     */
    static size_t chars(const unicode_t string);

    /**
     * How many chars are required to encode a given unicode character.
     * @param character to encode.
     * @return number of chars required to encode given character.
     */
    static size_t chars(ucs4_t character);

    /**
     * Convert a unicode string into utf8.
     * @param string of unicode data to convert.
     * @param buffer of character protocol to put data into.
     * @return number of code points converted.
     */
    static size_t unpack(const unicode_t string, CharacterProtocol& buffer);

    /**
     * Convert a utf8 string into a unicode data buffer.
     * @param unicode data buffer.
     * @param buffer of character protocol to pack from.
     * @param size of unicode data buffer in codepoints.
     * @return number of code points converted.
     */
    static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size);

    /**
     * Dup a utf8 string into a ucs4_t string.
     * @param string of utf8 data.
     * @return newly duplicated ucs4_t string.  NOTE(review): presumably
     * heap allocated; the dupfree<ucs4_t*> specialization in this header
     * releases such a pointer with ::free() -- confirm against the
     * implementation.
     */
    static ucs4_t *udup(const char *string);

    /**
     * Dup a utf8 string into a ucs2_t representation.
     * @param string of utf8 data.
     * @return newly duplicated ucs2_t string.  NOTE(review): presumably
     * heap allocated; the dupfree<ucs2_t*> specialization in this header
     * releases such a pointer with ::free() -- confirm against the
     * implementation.
     */
    static ucs2_t *wdup(const char *string);

    /**
     * Find first occurrence of character in string.
     * @param string to search in.
     * @param character code to search for.
     * @param start offset in string in codepoints.
     * @return pointer to first instance or NULL if not found.
     */
    static const char *find(const char *string, ucs4_t character, size_t start = 0);

    /**
     * Find last occurrence of character in string.
     * @param string to search in.
     * @param character code to search for.
     * @param end offset to start from in codepoints.  The default is the
     * largest size_t value.
     * @return pointer to last instance or NULL if not found.
     */
    static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);

    /**
     * Count occurrences of a unicode character in string.
     * @param string to search in.
     * @param character code to search for.
     * @return count of occurrences.
     */
    static unsigned ccount(const char *string, ucs4_t character);

    /**
     * Get a unicode character from a character protocol.
     * @param buffer of character protocol to read from.
     * @return unicode character or EOF on error.
     */
    static ucs4_t get(CharacterProtocol& buffer);

    /**
     * Push a unicode character to a character protocol.
     * @param character to push to buffer.
     * @param buffer of character protocol to push character to.
     * @return unicode character or EOF on error.
     */
    static ucs4_t put(ucs4_t character, CharacterProtocol& buffer);
};

/**
 * A copy-on-write utf8 string class that operates by reference count.  This
 * is derived from the classic uCommon String class by adding operations that
 * are utf8 encoding aware.  Offsets and sizes taken by the methods declared
 * here are expressed in codepoints unless noted otherwise.
 * @author David Sugar <dyfet@gnutelephony.org>
 */
class __EXPORT UString : public String, public utf8
{
public:
    // The constructors, operators and convenience methods below form the
    // user-facing interface of the class (also exposed through the
    // ustring_t typedef), so they must be publicly accessible.

    /**
     * Create a new empty utf8 aware string object.
     */
    UString();

    /**
     * Create an empty string with a buffer pre-allocated to a specified size.
     * @param size of buffer to allocate.
     */
    UString(strsize_t size);

    /**
     * Create a utf8 aware string for a null terminated unicode string.
     * @param text of ucs4 encoded data.
     */
    UString(const unicode_t text);

    /**
     * Create a string from null terminated text up to a maximum specified
     * size.
     * @param text to use for string.
     * @param size limit of new string.
     */
    UString(const char *text, strsize_t size);

    /**
     * Create a string for a substring.  The end of the substring is a
     * pointer within the substring itself.
     * @param text to use for string.
     * @param end of text in substring.
     */
    UString(const unicode_t *text, const unicode_t *end);

    /**
     * Construct a copy of a string object.  Our copy inherits the same
     * reference counted instance of cstring as in the original.
     * @param existing string to copy from.
     */
    UString(const UString& existing);

    /**
     * Destroy string.  De-reference cstring.  If last reference to cstring,
     * then also remove cstring from heap.
     */
    virtual ~UString();

    /**
     * Get a new string object as a substring of the current object.
     * @param codepoint offset of substring.
     * @param size of substring in codepoints or 0 if to end.
     * @return string object holding substring.
     */
    UString get(strsize_t codepoint, strsize_t size = 0) const;

    /**
     * Extract a unicode byte sequence from utf8 object.
     * @param unicode data buffer.
     * @param size of data buffer.
     * @return codepoints copied.
     */
    size_t get(unicode_t unicode, size_t size) const;

    /**
     * Set a utf8 encoded string based on unicode data.
     * @param unicode text to set.
     */
    void set(const unicode_t unicode);

    /**
     * Add (append) unicode to a utf8 encoded string.
     * @param unicode text to add.
     */
    void add(const unicode_t unicode);

    /**
     * Return unicode character found at a specific codepoint in the string.
     * @param position of codepoint in string, negative values computed from end.
     * @return character code at specified position in string.
     */
    ucs4_t at(int position) const;

    /**
     * Extract a unicode byte sequence from utf8 object.  Function-call
     * shorthand for get(unicode, size).
     * @param unicode data buffer.
     * @param size of data buffer.
     * @return codepoints copied.
     */
    inline size_t operator()(unicode_t unicode, size_t size) const
        {return get(unicode, size);}

    /**
     * Get a new substring through object expression.
     * @param codepoint offset of substring.
     * @param size of substring or 0 if to end.
     * @return string object holding substring.
     */
    UString operator()(int codepoint, strsize_t size) const;

    /**
     * Convenience method for left of string.  Forwards to
     * operator()(0, size).
     * @param size of substring to gather in codepoints.
     * @return string object holding substring.
     */
    inline UString left(strsize_t size) const
        {return operator()(0, size);}

    /**
     * Convenience method for right of string.  Forwards to operator() with
     * the negated offset and a size of 0.
     * @param offset of substring from right in codepoints.
     * @return string object holding substring.
     */
    inline UString right(strsize_t offset) const
        {return operator()(-((int)offset), 0);}

    /**
     * Convenience method for substring extraction.  Forwards to
     * operator()(offset, size).
     * @param offset into string.
     * @param size of string to return.
     * @return string object holding substring.
     */
    inline UString copy(strsize_t offset, strsize_t size) const
        {return operator()((int)offset, size);}

    /**
     * Cut (remove) text from string using codepoint offsets.
     * @param offset to start of text field to remove.
     * @param size of text field to remove or 0 to remove to end of string.
     */
    void cut(strsize_t offset, strsize_t size = 0);

    /**
     * Insert (paste) text into string using codepoint offsets.
     * @param offset to start paste.
     * @param text to paste.
     * @param size of text to paste.
     */
    void paste(strsize_t offset, const char *text, strsize_t size = 0);

    /**
     * Reference a string in the object by codepoint offset.  Positive
     * offsets are from the start of the string, negative from the
     * end.
     * @param offset to string position.
     * @return pointer to string data or NULL if invalid offset.
     */
    const char *operator()(int offset) const;

    /**
     * Reference a unicode character in string object by array offset.
     * Forwards to UString::at().
     * @param position of codepoint offset to character.
     * @return character value at offset.
     */
    inline ucs4_t operator[](int position) const
        {return UString::at(position);}

    /**
     * Count codepoints in current string.
     * @return count of codepoints, 0 when no string data is attached.
     */
    inline strsize_t count(void) const
        // Guard the dereference: str is presumably NULL for a string that
        // has no backing cstring yet -- confirm against String.
        {return str ? utf8::count(str->text) : 0;}

    /**
     * Count occurrences of a unicode character in string.
     * @param character code to search for.
     * @return count of occurrences.
     */
    unsigned ccount(ucs4_t character) const;

    /**
     * Find first occurrence of character in string.
     * @param character code to search for.
     * @param start offset in string in codepoints.
     * @return pointer to first instance or NULL if not found.
     */
    const char *find(ucs4_t character, strsize_t start = 0) const;

    /**
     * Find last occurrence of character in string.
     * @param character code to search for.
     * @param end offset to start from in codepoints.
     * @return pointer to last instance or NULL if not found.
     */
    const char *rfind(ucs4_t character, strsize_t end = npos) const;
};

/**
 * A utf8-aware replacement for a plain "char *".  The object wraps a single
 * pointer into utf8 encoded text and moves it in codepoint steps rather
 * than byte steps.
 * @author David Sugar <dyfet@gnutelephony.org>
 */
class __EXPORT utf8_pointer
{
protected:
    uint8_t *text;

public:
    /**
     * Construct a utf8 pointer that refers to nothing (NULL).
     */
    utf8_pointer();

    /**
     * Construct a utf8 pointer that refers to an existing char string.
     * @param string pointer to use.
     */
    utf8_pointer(const char *string);

    /**
     * Construct a utf8 pointer from another utf8 pointer.
     * @param copy of object to use.
     */
    utf8_pointer(const utf8_pointer& copy);

    /**
     * Step the pointer forward to the next codepoint.
     * @return object incremented.
     */
    utf8_pointer& operator ++();

    /**
     * Step the pointer backward to the prior codepoint.
     * @return object decremented.
     */
    utf8_pointer& operator --();

    /**
     * Move the pointer forward by a number of codepoints.
     * @param offset to increment by.
     * @return object incremented.
     */
    utf8_pointer& operator +=(long offset);

    /**
     * Move the pointer backward by a number of codepoints.
     * @param offset to decrement by.
     * @return object decremented.
     */
    utf8_pointer& operator -=(long offset);

    /**
     * Produce a new utf8 pointer advanced by a codepoint offset.
     * @param offset to add.
     * @return new utf8 pointer pointing to specified offset.
     */
    utf8_pointer operator+(long offset) const;

    /**
     * Produce a new utf8 pointer moved back by a codepoint offset.
     * @param offset to subtract.
     * @return new utf8 pointer pointing to specified offset.
     */
    utf8_pointer operator-(long offset) const;

    /**
     * Test whether the pointer refers to any text.
     * @return true if not NULL.
     */
    inline operator bool() const
    {
        return NULL != text;
    }

    /**
     * Test whether the pointer refers to no text.
     * @return true if NULL.
     */
    inline bool operator!() const
    {
        return NULL == text;
    }

    /**
     * Fetch the unicode character at a given codepoint offset.
     * @param codepoint offset to extract character from.
     * @return unicode character or 0.
     */
    ucs4_t operator[](long codepoint) const;

    /**
     * Re-target the pointer at a utf8 string.
     * @param string to point to.
     * @return current object after set to string.
     */
    utf8_pointer& operator=(const char *string);

    /**
     * Step the pointer forward to the next codepoint.
     */
    void inc(void);

    /**
     * Step the pointer backward to the prior codepoint.
     */
    void dec(void);

    /**
     * Address comparison with a char string (no text comparison is done).
     * @param string to check.
     * @return true if same memory address.
     */
    inline bool operator==(const char *string) const
    {
        return c_str() == string;
    }

    /**
     * Negated address comparison with a char string.
     * @param string to check.
     * @return false if same memory address.
     */
    inline bool operator!=(const char *string) const
    {
        return c_str() != string;
    }

    /**
     * Decode the unicode character at the current position by passing the
     * pointer to utf8::codepoint().
     * @return unicode character we are pointing to.
     */
    inline ucs4_t operator*() const
    {
        return utf8::codepoint(c_str());
    }

    /**
     * Access the pointer as a plain c string.
     * @return string we point to.
     */
    inline char *c_str(void) const
    {
        return reinterpret_cast<char *>(text);
    }

    /**
     * Conversion of the utf8 pointer to a generic string pointer.
     * @return generic string pointer.
     */
    inline operator char*() const
    {
        return c_str();
    }

    /**
     * Length of the null terminated utf8 string in codepoints, as computed
     * by utf8::count().
     * @return codepoint length of string.
     */
    inline size_t len(void) const
    {
        return utf8::count(c_str());
    }
};

/**
 * Free-function shorthand for utf8::udup(): duplicate a utf8 string as
 * ucs4_t data.
 * @param string of utf8 data.
 * @return whatever utf8::udup() returns for the string.
 */
inline ucs4_t *strudup(const char *string)
{
    return utf8::udup(string);
}

/**
 * Free-function shorthand for utf8::wdup(): duplicate a utf8 string as
 * ucs2_t data.
 * @param string of utf8 data.
 * @return whatever utf8::wdup() returns for the string.
 */
inline ucs2_t *strwdup(const char *string)
{
    return utf8::wdup(string);
}

/**
 * Duplicate a string as unicode_t data.
 * NOTE(review): the input is presumably utf8 encoded, matching strudup()
 * and strwdup() above -- confirm against the implementation.
 * @param string to duplicate.
 * @return unicode_t value for the duplicated data.
 */
__EXPORT unicode_t unidup(const char *string);

/**
 * dupfree specialization for ucs2_t strings.  The pointer is handed to the
 * C library ::free().
 * @param string pointer to release.
 */
template<>
inline void dupfree<ucs2_t*>(ucs2_t *string)
{
    ::free(string);
}

/**
 * dupfree specialization for ucs4_t strings.  The pointer is handed to the
 * C library ::free().
 * @param string pointer to release.
 */
template<>
inline void dupfree<ucs4_t*>(ucs4_t *string)
{
    ::free(string);
}

/**
 * dupfree specialization for unicode_t data.  The pointer is handed to the
 * C library ::free().
 * @param string pointer to release.
 */
template<>
inline void dupfree<unicode_t>(unicode_t string)
{
    ::free(string);
}

/**
 * Convenience type for utf8 encoded strings.  This is an alias of the
 * UString class.
 */
typedef UString ustring_t;

/**
 * Convenience type for utf8_pointer strings.  This is an alias of the
 * utf8_pointer class.
 */
typedef utf8_pointer utf8_t;

END_NAMESPACE

#endif
// libucommon-dev 6.0.7-1.1 / usr / include / ucommon / unicode.h