text_processing.cc | Explore in Territory

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "text_processing.h"

#include <stdio.h>
#include <string.h>

namespace chrome_lang_id {
namespace CLD2 {
namespace {

// Maximum distance, in bytes, that BackscanToSpace and ForwardscanToSpace
// search for a space before falling back to a clean UTF-8 boundary.
static const int kMaxSpaceScan = …;  // Bytes

// NOTE(review): presumably returns the smaller of a and b, as the name
// suggests -- confirm against the definition.
int minint(int a, int b) { … }

// Counts the number of spaces in the src_len bytes starting at src; a little
// faster than checking one byte at a time.
// Does not count the odd bytes left over at the end, so the result can be
// slightly lower than the true number of spaces.
// NOTE(review): the "4" in the name suggests bytes are examined in groups of
// four, which would make the uncounted tail 0..3 bytes -- confirm.
int CountSpaces4(const char *src, int src_len) { … }

// Uses a cheap predictor to get a measure of compression, and hence a measure
// of repetitiveness. It works on complete UTF-8 characters instead of bytes,
// because three-byte UTF-8 text (Indic scripts, etc.) compresses highly all
// the time when measured with a byte-based count. Sigh.
//
// To allow running prediction across multiple chunks, the caller passes in the
// current 12-bit hash value (hash) and an int[4096] prediction table (tbl).
// The caller initializes both to 0 before the first call.
//
// Returns the number of *bytes* correctly predicted; the count is incremented
// by 1..4 for each correctly predicted character.
//
// NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text.
//

// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen

int CountPredictedBytes(const char *isrc, int src_len, int *hash, int *tbl) { … }

// Backscan to a word boundary, returning how many bytes n to go back so that
// src - n is a non-space and src - n - 1 is a space.
// If no such boundary is found within kMaxSpaceScan bytes, returns 0..3 to
// reach a clean UTF-8 boundary instead.
// NOTE(review): limit presumably bounds how far back the scan may go --
// confirm against the definition.
int BackscanToSpace(const char *src, int limit) { … }

// Forwardscan to a word boundary, returning how many bytes n to go forward so
// that src + n is a non-space and src + n - 1 is a space.
// If no such boundary is found within kMaxSpaceScan bytes, returns 0..3 to
// reach a clean UTF-8 boundary instead.
// NOTE(review): limit presumably bounds how far forward the scan may go --
// confirm against the definition.
int ForwardscanToSpace(const char *src, int limit) { … }

}  // namespace

// Number of entries in the prediction table handed to the cheap compressor.
// Must be exactly 4096 for cheap compressor (one entry per 12-bit hash value).
static const int kPredictionTableSize = …;
// Chunk size used when the caller does not supply a positive one.
static const int kChunksizeDefault = …;      // Squeeze 48-byte chunks
// Space-density threshold, as a percentage of a chunk.
static const int kSpacesThreshPercent = …;   // Squeeze if >=30% spaces
// Predicted-byte threshold, as a percentage of a chunk.
static const int kPredictThreshPercent = …;  // Squeeze if >=40% predicted

// Removes portions of text that have a high density of spaces, or that are
// overly repetitive, squeezing the remaining text in place to the front of the
// input buffer.
//
// Squeezing looks at the density of space/predicted chars in fixed-size
// chunks, specified by ichunksize. An ichunksize <= 0 uses the default size of
// 48 bytes.
//
// Returns the new, possibly shorter, length.
//
// If the input buffer has a leading space and ends with
// space space space NUL, the result buffer ALWAYS does too.
//
int CheapSqueezeInplace(char *isrc, int src_len, int ichunksize) { … }

}  // namespace CLD2
}  // namespace chrome_lang_id
chromium/third_party/cld_3/src/src/script_span/text_processing.cc