// Copyright 2024 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_ #define THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_ #include "third_party/blink/renderer/core/dom/node.h" #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h" #include "third_party/blink/renderer/platform/wtf/vector.h" namespace blink { class HTMLIFrameElement; // Returns true if the content of `iframe_element` should be included for // inner text or document passages. bool ShouldContentExtractionIncludeIFrame(const HTMLIFrameElement& iframe_element); // Chunks documents into text passages. Each passage contains either a single // node of text, or the text of the node and its siblings and descendants if the // total number of words is less than max_words_per_aggregate_passage. This is // done by recursively walking the document tree, gathering the content of // individual text nodes ("segments") and then aggregating these into longer // strings ("passages"), each containing whitespace-joined segments from zero or // more siblings and descendants. class DocumentChunker { … }; } // namespace blink #endif // THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_