// document_chunker.h | Explore in Territory

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_
#define THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_

#include "third_party/blink/renderer/core/dom/node.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
#include "third_party/blink/renderer/platform/wtf/vector.h"

namespace blink {

class HTMLIFrameElement;

// Decides whether the content of `iframe_element` should be included for
// inner text or document passages.
//
// Returns true if the iframe's content should be included, false otherwise.
// This header only declares the function; the inclusion criteria live with
// its definition.
bool ShouldContentExtractionIncludeIFrame(const HTMLIFrameElement& iframe_element);

// Chunks documents into text passages.
//
// Terminology:
//   - "segment": the content of an individual text node.
//   - "passage": a longer string made of whitespace-joined segments taken from
//     zero or more siblings and descendants.
//
// The document tree is walked recursively, gathering segments and then
// aggregating them into passages. Each passage contains either a single node
// of text, or the text of the node and its siblings and descendants if the
// total number of words is less than `max_words_per_aggregate_passage`.
class DocumentChunker { … };

}  // namespace blink

#endif  // THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_
// chromium/third_party/blink/renderer/modules/content_extraction/document_chunker.h