chromium/third_party/cld_3/src/src/feature_extractor.h

/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Generic feature extractor for extracting features from objects. The feature
// extractor can be used for extracting features from any object. The feature
// extractor and feature function classes are template classes that have to
// be instantiated for extracting feature from a specific object type.
//
// A feature extractor consists of a hierarchy of feature functions. Each
// feature function extracts one or more feature type and value pairs from the
// object.
//
// The feature extractor has a modular design where new feature functions can be
// registered as components. The feature extractor is initialized from a
// descriptor represented by a protocol buffer. The feature extractor can also
// be initialized from a text-based source specification of the feature
// extractor. Feature specification parsers can be added as components. By
// default the feature extractor can be read from an ASCII protocol buffer or in
// a simple feature modeling language (fml).

// A feature function is invoked with a focus. Nested feature function can be
// invoked with another focus determined by the parent feature function.

#ifndef FEATURE_EXTRACTOR_H_
#define FEATURE_EXTRACTOR_H_

#include <stddef.h>
#include <memory>
#include <string>
#include <vector>

#include "base.h"
#include "cld_3/protos/feature_extractor.pb.h"
#include "feature_types.h"
#include "registry.h"
#include "script_span/stringpiece.h"
#include "task_context.h"
#include "utils.h"
#include "workspace.h"

namespace chrome_lang_id {

// TODO(djweiss) Clean this up as well.
// Use the same type for feature values as is used for predicated.
Predicate;
FeatureValue;

// Output feature model in FML format.
void ToFMLFunction(const FeatureFunctionDescriptor &function, string *output);
void ToFML(const FeatureFunctionDescriptor &function, string *output);

// A union used to represent discrete and continuous feature values.
FloatFeatureValue;

// A feature vector contains feature type and value pairs.
class FeatureVector {};

// The generic feature extractor is the type-independent part of a feature
// extractor. This holds the descriptor for the feature extractor and the
// collection of feature types used in the feature extractor.  The feature
// types are not available until FeatureExtractor<>::Init() has been called.
class GenericFeatureExtractor {};

// The generic feature function is the type-independent part of a feature
// function. Each feature function is associated with the descriptor that it is
// instantiated from.  The feature types associated with this feature function
// will be established by the time FeatureExtractor<>::Init() completes.
class GenericFeatureFunction {};

// Feature function that can extract features from an object.  Templated on
// two type arguments:
//
// OBJ:  The "object" from which features are extracted; e.g., a sentence.  This
//       should be a plain type, rather than a reference or pointer.
//
// ARGS: A set of 0 or more types that are used to "index" into some part of the
//       object that should be extracted, e.g. an int token index for a sentence
//       object.  This should not be a reference type.
template <class OBJ, class... ARGS>
class FeatureFunction
    : public GenericFeatureFunction,
      public RegisterableClass<FeatureFunction<OBJ, ARGS...> > {};

// Base class for features with nested feature functions. The nested functions
// are of type NES, which may be different from the type of the parent function.
// NB: NestedFeatureFunction will ensure that all initialization of nested
// functions takes place during Setup() and Init() -- after the nested features
// are initialized, the parent feature is initialized via SetupNested() and
// InitNested(). Alternatively, a derived classes that overrides Setup() and
// Init() directly should call Parent::Setup(), Parent::Init(), etc. first.
//
// Note: NestedFeatureFunction cannot know how to call Preprocess, Evaluate, or
// Compute, since the nested functions may be of a different type.
template <class NES, class OBJ, class... ARGS>
class NestedFeatureFunction : public FeatureFunction<OBJ, ARGS...> {};

// Base class for a nested feature function that takes nested features with the
// same signature as these features, i.e. a meta feature. For this class, we can
// provide preprocessing of the nested features.
template <class OBJ, class... ARGS>
class MetaFeatureFunction
    : public NestedFeatureFunction<FeatureFunction<OBJ, ARGS...>, OBJ,
                                   ARGS...> {};

// Template for a special type of locator: The locator of type
// FeatureFunction<OBJ, ARGS...> calls nested functions of type
// FeatureFunction<OBJ, IDX, ARGS...>, where the derived class DER is
// responsible for translating by providing the following:
//
// // Gets the new additional focus.
// IDX GetFocus(const WorkspaceSet &workspaces, const OBJ &object);
//
// This is useful to e.g. add a token focus to a parser state based on some
// desired property of that state.
template <class DER, class OBJ, class IDX, class... ARGS>
class FeatureAddFocusLocator
    : public NestedFeatureFunction<FeatureFunction<OBJ, IDX, ARGS...>, OBJ,
                                   ARGS...> {};

// CRTP feature locator class. This is a meta feature that modifies ARGS and
// then calls the nested feature functions with the modified ARGS. Note that in
// order for this template to work correctly, all of ARGS must be types for
// which the reference operator & can be interpreted as a pointer to the
// argument. The derived class DER must implement the UpdateFocus method which
// takes pointers to the ARGS arguments:
//
// // Updates the current arguments.
// void UpdateArgs(const OBJ &object, ARGS *...args) const;
template <class DER, class OBJ, class... ARGS>
class FeatureLocator : public MetaFeatureFunction<OBJ, ARGS...> {};

// Feature extractor for extracting features from objects of a certain class.
// Template type parameters are as defined for FeatureFunction.
template <class OBJ, class... ARGS>
class FeatureExtractor : public GenericFeatureExtractor {};

}  // namespace chrome_lang_id

#endif  // FEATURE_EXTRACTOR_H_