// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "text_processing.h" #include <stdio.h> #include <string.h> namespace chrome_lang_id { namespace CLD2 { namespace { static const int kMaxSpaceScan = …; // Bytes int minint(int a, int b) { … } // Counts number of spaces; a little faster than one-at-a-time // Doesn't count odd bytes at end int CountSpaces4(const char *src, int src_len) { … } // This uses a cheap predictor to get a measure of compression, and // hence a measure of repetitiveness. It works on complete UTF-8 characters // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly // all the time when done with a byte-based count. Sigh. // // To allow running prediction across multiple chunks, caller passes in current // 12-bit hash value and int[4096] prediction table. Caller inits these to 0. // // Returns the number of *bytes* correctly predicted, increments by 1..4 for // each correctly-predicted character. // // NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text // // TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen int CountPredictedBytes(const char *isrc, int src_len, int *hash, int *tbl) { … } // Backscan to word boundary, returning how many bytes n to go back // so that src - n is non-space and src - n - 1 is space. 
// If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary int BackscanToSpace(const char *src, int limit) { … } // Forwardscan to word boundary, returning how many bytes n to go forward // so that src + n is non-space and src + n - 1 is space. // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary int ForwardscanToSpace(const char *src, int limit) { … } } // namespace // Must be exactly 4096 for cheap compressor. static const int kPredictionTableSize = …; static const int kChunksizeDefault = …; // Squeeze 48-byte chunks static const int kSpacesThreshPercent = …; // Squeeze if >=30% spaces static const int kPredictThreshPercent = …; // Squeeze if >=40% predicted // Remove portions of text that have a high density of spaces, or that are // overly repetitive, squeezing the remaining text in-place to the front of the // input buffer. // // Squeezing looks at density of space/predicted chars in fixed-size chunks, // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes. // // Return the new, possibly-shorter length // // Result Buffer ALWAYS has leading space and trailing space space space NUL, // if input does // int CheapSqueezeInplace(char *isrc, int src_len, int ichunksize) { … } } // namespace CLD2 } // namespace chrome_lang_id