// csutil.cxx

/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Hunspell, based on MySpell.
 *
 * The Initial Developers of the Original Code are
 * Kevin Hendricks (MySpell) and Németh László (Hunspell).
 * Portions created by the Initial Developers are Copyright (C) 2002-2005
 * the Initial Developers. All Rights Reserved.
 *
 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
/*
 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
 * And Contributors.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. All modifications to the source code must be clearly marked as
 *    such.  Binary redistributions based on modified source code
 *    must be clearly marked as modified versions in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <algorithm>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include <sstream>

#include "csutil.hxx"
#include "atypes.hxx"
#include "langnum.hxx"

// Unicode character encoding information
struct unicode_info { … };

#ifdef _WIN32
#include <windows.h>
#include <wchar.h>
#endif

#ifdef OPENOFFICEORG
#include <unicode/uchar.h>
#else
#ifndef MOZILLA_CLIENT
#include "utf_info.cxx"
#define UTF_LST_LEN …
#endif
#endif

#ifdef MOZILLA_CLIENT
#include "nsCOMPtr.h"
#include "nsIUnicodeEncoder.h"
#include "nsIUnicodeDecoder.h"
#include "nsUnicharUtils.h"
#include "mozilla/dom/EncodingUtils.h"

using mozilla::dom::EncodingUtils;
#endif

struct unicode_info2 { … };

static struct unicode_info2* utf_tbl = …;
static int utf_tbl_count = …;  // utf_tbl can be used by multiple Hunspell instances

void myopen(std::ifstream& stream, const char* path, std::ios_base::openmode mode)
{ … }

std::string& u16_u8(std::string& dest, const std::vector<w_char>& src) { … }

int u8_u16(std::vector<w_char>& dest, const std::string& src) { … }

namespace {
class is_any_of { … };
}

std::string::const_iterator mystrsep(const std::string &str,
                                     std::string::const_iterator& start) { … }

// replaces strdup with ansi version
char* mystrdup(const char* s) { … }

// remove cross-platform text line end characters
void mychomp(std::string& s) { … }

// break text to lines
std::vector<std::string> line_tok(const std::string& text, char breakchar) { … }

// uniq line in place
void line_uniq(std::string& text, char breakchar)
{ … }

// uniq and boundary for compound analysis: "1\n\2\n\1" -> " ( \1 | \2 ) "
void line_uniq_app(std::string& text, char breakchar) { … }

// append apd to the end of every line in str
std::string& strlinecat(std::string& str, const std::string& apd) { … }

int fieldlen(const char* r) { … }

bool copy_field(std::string& dest,
                const std::string& morph,
                const std::string& var) { … }

std::string& mystrrep(std::string& str,
                      const std::string& search,
                      const std::string& replace) { … }

// reverse word
size_t reverseword(std::string& word) { … }

// reverse word
size_t reverseword_utf(std::string& word) { … }

void uniqlist(std::vector<std::string>& list) { … }

namespace {
unsigned char cupper(const struct cs_info* csconv, int nIndex) { … }

unsigned char clower(const struct cs_info* csconv, int nIndex) { … }

unsigned char ccase(const struct cs_info* csconv, int nIndex) { … }
}

w_char upper_utf(w_char u, int langnum) { … }

w_char lower_utf(w_char u, int langnum) { … }

// convert std::string to all caps
std::string& mkallcap(std::string& s, const struct cs_info* csconv) { … }

// convert std::string to all lowercase
std::string& mkallsmall(std::string& s, const struct cs_info* csconv) { … }

std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u,
                                    int langnum) { … }

std::vector<w_char>& mkallcap_utf(std::vector<w_char>& u, int langnum) { … }

std::string& mkinitcap(std::string& s, const struct cs_info* csconv) { … }

std::vector<w_char>& mkinitcap_utf(std::vector<w_char>& u, int langnum) { … }

std::string& mkinitsmall(std::string& s, const struct cs_info* csconv) { … }

std::vector<w_char>& mkinitsmall_utf(std::vector<w_char>& u, int langnum) { … }

// conversion function for protected memory
void store_pointer(char* dest, char* source) { … }

// conversion function for protected memory
char* get_stored_pointer(const char* s) { … }

#ifndef MOZILLA_CLIENT

// these are simple character mappings for the
// encodings supported
// supplying isupper, tolower, and toupper

static struct cs_info iso1_tbl[] = …;

static struct cs_info iso2_tbl[] = …;

static struct cs_info iso3_tbl[] = …;

static struct cs_info iso4_tbl[] = …;

static struct cs_info iso5_tbl[] = …;

static struct cs_info iso6_tbl[] = …;

static struct cs_info iso7_tbl[] = …;

static struct cs_info iso8_tbl[] = …;

static struct cs_info iso9_tbl[] = …;

static struct cs_info iso10_tbl[] = …;

static struct cs_info koi8r_tbl[] = …;

static struct cs_info koi8u_tbl[] = …;

static struct cs_info cp1251_tbl[] = …;

static struct cs_info iso13_tbl[] = …;

static struct cs_info iso14_tbl[] = …;

static struct cs_info iso15_tbl[] = …;

static struct cs_info iscii_devanagari_tbl[] = …;

static struct cs_info tis620_tbl[] = …;

struct enc_entry { … };

static struct enc_entry encds[] = …;

/* map to lower case and remove non alphanumeric chars */
static void toAsciiLowerAndRemoveNonAlphanumeric(const char* pName,
                                                 char* pBuf) { … }

struct cs_info* get_current_cs(const std::string& es) { … }
#else
// XXX This function was rewritten for Mozilla. Instead of storing the
// conversion tables statically in this file, create them when needed
// with the help of the Mozilla backend.
struct cs_info* get_current_cs(const std::string& es) {
  struct cs_info* ccs = new cs_info[256];
  // Initialze the array with dummy data so that we wouldn't need
  // to return null in case of failures.
  for (int i = 0; i <= 0xff; ++i) {
    ccs[i].ccase = false;
    ccs[i].clower = i;
    ccs[i].cupper = i;
  }

  nsCOMPtr<nsIUnicodeEncoder> encoder;
  nsCOMPtr<nsIUnicodeDecoder> decoder;

  nsresult rv;

  nsAutoCString label(es.c_str());
  nsAutoCString encoding;
  if (!EncodingUtils::FindEncodingForLabelNoReplacement(label, encoding)) {
    return ccs;
  }
  encoder = EncodingUtils::EncoderForEncoding(encoding);
  decoder = EncodingUtils::DecoderForEncoding(encoding);
  encoder->SetOutputErrorBehavior(encoder->kOnError_Signal, nullptr, '?');
  decoder->SetInputErrorBehavior(decoder->kOnError_Signal);

  for (unsigned int i = 0; i <= 0xff; ++i) {
    bool success = false;
    // We want to find the upper/lowercase equivalents of each byte
    // in this 1-byte character encoding.  Call our encoding/decoding
    // APIs separately for each byte since they may reject some of the
    // bytes, and we want to handle errors separately for each byte.
    char lower, upper;
    do {
      if (i == 0)
        break;
      const char source = char(i);
      char16_t uni, uniCased;
      int32_t charLength = 1, uniLength = 1;

      rv = decoder->Convert(&source, &charLength, &uni, &uniLength);
      // Explicitly check NS_OK because we don't want to allow
      // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT.
      if (rv != NS_OK || charLength != 1 || uniLength != 1)
        break;
      uniCased = ToLowerCase(uni);
      rv = encoder->Convert(&uniCased, &uniLength, &lower, &charLength);
      // Explicitly check NS_OK because we don't want to allow
      // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT.
      if (rv != NS_OK || charLength != 1 || uniLength != 1)
        break;

      uniCased = ToUpperCase(uni);
      rv = encoder->Convert(&uniCased, &uniLength, &upper, &charLength);
      // Explicitly check NS_OK because we don't want to allow
      // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT.
      if (rv != NS_OK || charLength != 1 || uniLength != 1)
        break;

      success = true;
    } while (0);

    if (success) {
      ccs[i].cupper = upper;
      ccs[i].clower = lower;
    } else {
      ccs[i].cupper = i;
      ccs[i].clower = i;
    }

    if (ccs[i].clower != (unsigned char)i)
      ccs[i].ccase = true;
    else
      ccs[i].ccase = false;
  }

  return ccs;
}
#endif

// primitive isalpha() replacement for tokenization
std::string get_casechars(const char* enc) { … }

// language to encoding default map

struct lang_map { … };

static struct lang_map lang2enc[] = …;

int get_lang_num(const std::string& lang) { … }

#ifndef OPENOFFICEORG
#ifndef MOZILLA_CLIENT
void initialize_utf_tbl() {
  utf_tbl_count++;
  if (utf_tbl)
    return;
  utf_tbl = new unicode_info2[CONTSIZE];
  for (size_t j = 0; j < CONTSIZE; ++j) {
    utf_tbl[j].cletter = 0;
    utf_tbl[j].clower = (unsigned short)j;
    utf_tbl[j].cupper = (unsigned short)j;
  }
  for (size_t j = 0; j < UTF_LST_LEN; ++j) {
    utf_tbl[utf_lst[j].c].cletter = 1;
    utf_tbl[utf_lst[j].c].clower = utf_lst[j].clower;
    utf_tbl[utf_lst[j].c].cupper = utf_lst[j].cupper;
  }
}
#endif
#endif

void free_utf_tbl() { … }

unsigned short unicodetoupper(unsigned short c, int langnum) { … }

unsigned short unicodetolower(unsigned short c, int langnum) { … }

int unicodeisalpha(unsigned short c) { … }

/* get type of capitalization */
int get_captype(const std::string& word, cs_info* csconv) { … }

int get_captype_utf8(const std::vector<w_char>& word, int langnum) { … }

// strip all ignored characters in the string
size_t remove_ignored_chars_utf(std::string& word,
                                const std::vector<w_char>& ignored_chars) { … }

// strip all ignored characters in the string
size_t remove_ignored_chars(std::string& word,
                            const std::string& ignored_chars) { … }

bool parse_string(const std::string& line, std::string& out, int ln) { … }

bool parse_array(const std::string& line,
                 std::string& out,
                 std::vector<w_char>& out_utf16,
                 int utf8,
                 int ln) { … }
// chromium/third_party/hunspell/src/hunspell/csutil.cxx