jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 1 | // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #ifndef PDF_PDFIUM_PDFIUM_PAGE_H_ |
| 6 | #define PDF_PDFIUM_PDFIUM_PAGE_H_ |
| 7 | |
Pratish Kumar | 8f9d6d6a | 2019-08-29 03:26:29 | [diff] [blame] | 8 | #include <map> |
| 9 | #include <set> |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 10 | #include <string> |
| 11 | #include <vector> |
| 12 | |
Virender Singh | 5bee2104 | 2019-08-13 07:10:01 | [diff] [blame] | 13 | #include "base/gtest_prod_util.h" |
Henrique Nakashima | 9d9e063 | 2017-10-06 21:38:18 | [diff] [blame] | 14 | #include "base/optional.h" |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 15 | #include "base/strings/string16.h" |
K Moon | 9a62bf4 | 2019-08-07 20:05:36 | [diff] [blame] | 16 | #include "pdf/page_orientation.h" |
Henrique Nakashima | fd7edf2a | 2017-11-29 22:03:49 | [diff] [blame] | 17 | #include "pdf/pdf_engine.h" |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 18 | #include "ppapi/cpp/rect.h" |
Tom Sepez | b0048f1 | 2018-05-14 22:56:47 | [diff] [blame] | 19 | #include "third_party/pdfium/public/cpp/fpdf_scopers.h" |
tsepez | 35024356 | 2015-05-12 01:08:45 | [diff] [blame] | 20 | #include "third_party/pdfium/public/fpdf_doc.h" |
| 21 | #include "third_party/pdfium/public/fpdf_formfill.h" |
| 22 | #include "third_party/pdfium/public/fpdf_text.h" |
Henrique Nakashima | 1a49dbc | 2018-02-08 21:00:33 | [diff] [blame] | 23 | #include "ui/gfx/geometry/point_f.h" |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 24 | |
// Forward declaration only. Within this header the type is named solely as the
// payload of the base::Optional returned by PDFiumPage::GetTextRunInfo().
Benjamin Beaudry | f80f4b5 | 2019-08-26 17:18:48 | [diff] [blame] | 25 | struct PP_PrivateAccessibilityTextRunInfo; |
| 26 | |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 27 | namespace chrome_pdf { |
| 28 | |
// Forward declaration only. PDFiumPage takes a PDFiumEngine* in its constructor
// and stores one in engine_, so the full definition is not needed here.
| 29 | class PDFiumEngine; |
| 30 | |
| 31 | // Wrapper around a page from the document. |
// Per the members declared below, each instance holds a PDFiumEngine pointer,
// scoped PDFium page and text-page handles, the page index and rectangle, an
// availability flag, and vectors of links and images each paired with a
// calculated_*_ flag. Copy and assignment are disallowed through
// DISALLOW_COPY_AND_ASSIGN; a move constructor is declared. Every member
// function without an inline body is defined outside this header.
| 32 | class PDFiumPage { |
| 33 | public: |
// NOTE(review): |engine| and |i| presumably seed engine_ and index_. The
// constructor bodies are not in this header, so confirm in the .cc file.
K Moon | b15a6a0 | 2019-08-30 01:18:40 | [diff] [blame] | 34 | PDFiumPage(PDFiumEngine* engine, int i); |
Tom Sepez | b0048f1 | 2018-05-14 22:56:47 | [diff] [blame] | 35 | PDFiumPage(PDFiumPage&& that); |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 36 | ~PDFiumPage(); |
thestig | ccb5fc8f | 2016-01-05 05:32:51 | [diff] [blame] | 37 | |
// Plain function pointer type: takes a URL string and returns bool. The static
// setter that follows is named as a test-only hook.
Lei Zhang | ad577c62 | 2019-08-02 20:35:32 | [diff] [blame] | 38 | using IsValidLinkFunction = bool (*)(const std::string& url); |
| 39 | static void SetIsValidLinkFunctionForTesting(IsValidLinkFunction function); |
| 40 | |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 41 | // Unloads the PDFium data for this page from memory. |
| 42 | void Unload(); |
| 43 | // Gets the FPDF_PAGE for this page, loading and parsing it if necessary. |
| 44 | FPDF_PAGE GetPage(); |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 45 | |
| 46 | // Returns FPDF_TEXTPAGE for the page, loading and parsing it if necessary. |
| 47 | FPDF_TEXTPAGE GetTextPage(); |
| 48 | |
Lei Zhang | f2aefe2 | 2019-08-27 22:08:02 | [diff] [blame] | 49 | // See definition of PDFEngine::GetTextRunInfo(). |
| 50 | base::Optional<PP_PrivateAccessibilityTextRunInfo> GetTextRunInfo( |
| 51 | int start_char_index); |
dmazzoni | c3547a3 | 2016-06-02 05:47:15 | [diff] [blame] | 52 | // Get a unicode character from the page. |
| 53 | uint32_t GetCharUnicode(int char_index); |
dmazzoni | d48d932 | 2016-06-13 19:37:42 | [diff] [blame] | 54 | // Get the bounds of a character in page pixels. |
| 55 | pp::FloatRect GetCharBounds(int char_index); |
dmazzoni | c3547a3 | 2016-06-02 05:47:15 | [diff] [blame] | 56 | |
// Classification of a region of the page. It is the return type of
// GetLinkTargetAtIndex(), GetCharIndex(), FormTypeToArea() and the private
// Get*Target() helpers declared further down.
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 57 | enum Area { |
| 58 | NONSELECTABLE_AREA, |
drgage | c32fae26 | 2017-06-24 00:17:49 | [diff] [blame] | 59 | TEXT_AREA, // Area contains regular, selectable text not |
| 60 | // within form fields. |
| 61 | WEBLINK_AREA, // Area is a hyperlink. |
| 62 | DOCLINK_AREA, // Area is a link to a different part of the same |
| 63 | // document. |
| 64 | FORM_TEXT_AREA, // Area is a form text field or form combobox text |
| 65 | // field. |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 66 | }; |
| 67 | |
// Out-parameter type for the link lookup methods. Which fields are meaningful
// depends on the Area value returned alongside it, as noted on each field.
| 68 | struct LinkTarget { |
Henrique Nakashima | 9d9e063 | 2017-10-06 21:38:18 | [diff] [blame] | 69 | LinkTarget(); |
| 70 | LinkTarget(const LinkTarget& other); |
| 71 | ~LinkTarget(); |
| 72 | |
| 73 | // Valid for WEBLINK_AREA only. |
| 74 | std::string url; |
| 75 | |
| 76 | // Valid for DOCLINK_AREA only. |
// NOTE(review): no in-class initializer here; confirm the out-of-line
// LinkTarget constructor gives |page| a defined value.
| 77 | int page; |
| 78 | // Valid for DOCLINK_AREA only. From the top of the page. |
Henrique Nakashima | 97f071c | 2018-01-11 19:56:02 | [diff] [blame] | 79 | base::Optional<float> y_in_pixels; |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 80 | }; |
| 81 | |
Mansi Awasthi | 6f4aa4a | 2019-09-07 05:34:07 | [diff] [blame^] | 82 | // Given a |link_index|, returns the type of underlying area and the link |
| 83 | // target. |target| must be valid. Returns NONSELECTABLE_AREA if |
| 84 | // |link_index| is invalid. |
| 85 | Area GetLinkTargetAtIndex(int link_index, LinkTarget* target); |
| 86 | |
Henrique Nakashima | 97f071c | 2018-01-11 19:56:02 | [diff] [blame] | 87 | // Returns the (x, y) position of a destination in page coordinates. |
Henrique Nakashima | 1a49dbc | 2018-02-08 21:00:33 | [diff] [blame] | 88 | base::Optional<gfx::PointF> GetPageXYTarget(FPDF_DEST destination); |
Henrique Nakashima | 97f071c | 2018-01-11 19:56:02 | [diff] [blame] | 89 | |
| 90 | // Transforms an (x, y) position in page coordinates to screen coordinates. |
Henrique Nakashima | 1a49dbc | 2018-02-08 21:00:33 | [diff] [blame] | 91 | gfx::PointF TransformPageToScreenXY(const gfx::PointF& xy); |
Henrique Nakashima | 9d9e063 | 2017-10-06 21:38:18 | [diff] [blame] | 92 | |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 93 | // Given a point in the document that's in this page, returns its character |
| 94 | // index if it's near a character, and also the type of text. |
| 95 | // Target is optional. It will be filled in for WEBLINK_AREA or |
| 96 | // DOCLINK_AREA only. |
// |char_index| and |form_type| are out-parameters as well.
// NOTE(review): whether they may be null is not stated here; confirm in the
// .cc file before passing nullptr.
thestig | 98913ba | 2017-04-21 19:03:25 | [diff] [blame] | 97 | Area GetCharIndex(const pp::Point& point, |
K Moon | 9a62bf4 | 2019-08-07 20:05:36 | [diff] [blame] | 98 | PageOrientation orientation, |
thestig | 98913ba | 2017-04-21 19:03:25 | [diff] [blame] | 99 | int* char_index, |
| 100 | int* form_type, |
| 101 | LinkTarget* target); |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 102 | |
drgage | c32fae26 | 2017-06-24 00:17:49 | [diff] [blame] | 103 | // Converts a form type to its corresponding Area. |
| 104 | static Area FormTypeToArea(int form_type); |
| 105 | |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 106 | // Gets the character at the given index. |
| 107 | base::char16 GetCharAtIndex(int index); |
| 108 | |
| 109 | // Gets the number of characters in the page. |
| 110 | int GetCharCount(); |
| 111 | |
Pratish Kumar | e7032973 | 2019-07-22 18:07:08 | [diff] [blame] | 112 | // Given a rectangle in page coordinates, computes the range of continuous |
| 113 | // characters which lie inside that rectangle. Returns false without |
| 114 | // modifying the out parameters if no character lies inside the rectangle. |
| 115 | bool GetUnderlyingTextRangeForRect(const pp::FloatRect& rect, |
| 116 | int* start_index, |
Lei Zhang | cf99301 | 2019-08-22 18:24:23 | [diff] [blame] | 117 | int* char_len); |
Pratish Kumar | e7032973 | 2019-07-22 18:07:08 | [diff] [blame] | 118 | |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 119 | // Converts from page coordinates to screen coordinates. |
// The input rectangle is passed as four separate doubles (left, top, right,
// bottom) and the result is an integer pp::Rect.
| 120 | pp::Rect PageToScreen(const pp::Point& offset, |
| 121 | double zoom, |
| 122 | double left, |
| 123 | double top, |
| 124 | double right, |
| 125 | double bottom, |
K Moon | 9a62bf4 | 2019-08-07 20:05:36 | [diff] [blame] | 126 | PageOrientation orientation) const; |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 127 | |
// NOTE(review): presumably returns a pointer to the page_features_ member;
// the body is not in this header, so confirm in the .cc file.
Henrique Nakashima | fd7edf2a | 2017-11-29 22:03:49 | [diff] [blame] | 128 | const PDFEngine::PageFeatures* GetPageFeatures(); |
| 129 | |
// Inline accessor returning index_.
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 130 | int index() const { return index_; } |
K Moon | b15a6a0 | 2019-08-30 01:18:40 | [diff] [blame] | 131 | |
// Inline getter and setter for rect_.
Jeremy Chinsen | cfcbaad | 2019-07-19 17:26:01 | [diff] [blame] | 132 | const pp::Rect& rect() const { return rect_; } |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 133 | void set_rect(const pp::Rect& r) { rect_ = r; } |
K Moon | b15a6a0 | 2019-08-30 01:18:40 | [diff] [blame] | 134 | |
| 135 | // Availability is a one-way transition: A page can become available, but it |
| 136 | // cannot become unavailable (unless deleted entirely). |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 137 | bool available() const { return available_; } |
K Moon | b15a6a0 | 2019-08-30 01:18:40 | [diff] [blame] | 138 | void MarkAvailable() { available_ = true; } |
| 139 | |
// Overwrites the calculated_links_ flag with the supplied value.
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 140 | void set_calculated_links(bool calculated_links) { |
thestig | 98913ba | 2017-04-21 19:03:25 | [diff] [blame] | 141 | calculated_links_ = calculated_links; |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 142 | } |
| 143 | |
// Const accessors that return whatever the scoped handles currently hold.
// Unlike GetPage() and GetTextPage() they perform no loading.
Tom Sepez | b0048f1 | 2018-05-14 22:56:47 | [diff] [blame] | 144 | FPDF_PAGE page() const { return page_.get(); } |
| 145 | FPDF_TEXTPAGE text_page() const { return text_page_.get(); } |
| 146 | |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 147 | private: |
// Test fixtures and individual test cases granted access to private members.
Virender Singh | 5bee2104 | 2019-08-13 07:10:01 | [diff] [blame] | 148 | friend class PDFiumPageLinkTest; |
Pratish Kumar | 7e49da29 | 2019-08-21 19:10:30 | [diff] [blame] | 149 | friend class PDFiumTestBase; |
| 150 | |
| 151 | FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, TestCalculateImages); |
Virender Singh | 5bee2104 | 2019-08-13 07:10:01 | [diff] [blame] | 152 | FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, TestLinkGeneration); |
| 153 | |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 154 | // Returns a link index if the given character index is over a link, or -1 |
| 155 | // otherwise. |
| 156 | int GetLink(int char_index, LinkTarget* target); |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 157 | // Calculate the locations of any links on the page. |
| 158 | void CalculateLinks(); |
Pratish Kumar | 7e49da29 | 2019-08-21 19:10:30 | [diff] [blame] | 159 | // Calculate the locations of images on the page. |
| 160 | void CalculateImages(); |
Henrique Nakashima | 9d9e063 | 2017-10-06 21:38:18 | [diff] [blame] | 161 | // Returns link type and fills target associated with a link. Returns |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 162 | // NONSELECTABLE_AREA if link detection failed. |
Henrique Nakashima | 9d9e063 | 2017-10-06 21:38:18 | [diff] [blame] | 163 | Area GetLinkTarget(FPDF_LINK link, LinkTarget* target); |
| 164 | // Returns link type and fills target associated with a destination. Returns |
| 165 | // NONSELECTABLE_AREA if detection failed. |
| 166 | Area GetDestinationTarget(FPDF_DEST destination, LinkTarget* target); |
| 167 | // Returns link type and fills target associated with a URI action. Returns |
| 168 | // NONSELECTABLE_AREA if detection failed. |
thestig | 139fa748 | 2017-06-27 19:51:33 | [diff] [blame] | 169 | Area GetURITarget(FPDF_ACTION uri_action, LinkTarget* target) const; |
Pratish Kumar | 860e883 | 2019-08-29 00:13:56 | [diff] [blame] | 170 | // Calculates the set of character indices on which text runs need to be |
| 171 | // broken for page objects such as links and images. |
| 172 | void CalculatePageObjectTextRunBreaks(); |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 173 | |
Pratish Kumar | 8f9d6d6a | 2019-08-29 03:26:29 | [diff] [blame] | 174 | // Key : Marked content id for the image element as specified in the |
| 175 | // struct tree. |
| 176 | // Value : Index of image in the |images_| vector. |
| 177 | using MarkedContentIdToImageMap = std::map<int, size_t>; |
| 178 | // Traverses the entire struct tree of the page recursively and extracts the |
| 179 | // alt text from struct tree elements corresponding to the marked content IDs |
| 180 | // present in |marked_content_id_image_map|. |
| 181 | void PopulateImageAltText( |
| 182 | const MarkedContentIdToImageMap& marked_content_id_image_map); |
| 183 | // Traverses a struct element and its sub-tree recursively and extracts the |
| 184 | // alt text from struct elements corresponding to the marked content IDs |
| 185 | // present in |marked_content_id_image_map|. Uses |visited_elements| to guard |
| 186 | // against malformed struct trees. |
| 187 | void PopulateImageAltTextForStructElement( |
| 188 | const MarkedContentIdToImageMap& marked_content_id_image_map, |
| 189 | FPDF_STRUCTELEMENT current_element, |
| 190 | std::set<FPDF_STRUCTELEMENT>* visited_elements); |
| 191 | |
// Scoped helper that keeps a const pointer to a PDFiumPage for its lifetime.
// NOTE(review): presumably its constructor and destructor adjust
// preventing_unload_count_ on that page; confirm in the .cc file.
Artem Strygin | fd53f2f | 2018-07-13 13:21:05 | [diff] [blame] | 192 | class ScopedUnloadPreventer { |
thestig | 03ac42d | 2014-12-20 16:04:00 | [diff] [blame] | 193 | public: |
Artem Strygin | fd53f2f | 2018-07-13 13:21:05 | [diff] [blame] | 194 | explicit ScopedUnloadPreventer(PDFiumPage* page); |
| 195 | ~ScopedUnloadPreventer(); |
thestig | 03ac42d | 2014-12-20 16:04:00 | [diff] [blame] | 196 | |
| 197 | private: |
| 198 | PDFiumPage* const page_; |
| 199 | }; |
| 200 | |
// One link on the page: the text range it covers (if any), its bounding
// rectangles, and its URL. Instances are stored in links_.
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 201 | struct Link { |
| 202 | Link(); |
thestig | fa6edbc7 | 2016-08-23 08:07:00 | [diff] [blame] | 203 | Link(const Link& that); |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 204 | ~Link(); |
| 205 | |
Virender Singh | 5bee2104 | 2019-08-13 07:10:01 | [diff] [blame] | 206 | // Represents start index of underlying text range. Should be -1 if the link |
| 207 | // is not over text. |
| 208 | int32_t start_char_index = -1; |
| 209 | // Represents the number of characters that the link overlaps with. |
| 210 | int32_t char_count = 0; |
| 211 | std::vector<pp::Rect> bounding_rects; |
| 212 | |
| 213 | // Valid for links with external urls only. |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 214 | std::string url; |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 215 | }; |
| 216 | |
Pratish Kumar | 7e49da29 | 2019-08-21 19:10:30 | [diff] [blame] | 217 | // Represents an Image inside the page. |
| 218 | struct Image { |
| 219 | Image(); |
| 220 | Image(const Image& other); |
| 221 | ~Image(); |
| 222 | |
| 223 | pp::Rect bounding_rect; |
Pratish Kumar | 8f9d6d6a | 2019-08-29 03:26:29 | [diff] [blame] | 224 | // Alt text is available only for tagged PDFs. |
| 225 | std::string alt_text; |
Pratish Kumar | 7e49da29 | 2019-08-21 19:10:30 | [diff] [blame] | 226 | }; |
| 227 | |
// NOTE(review): raw pointer, presumably not owned by the page; confirm the
// lifetime relationship with the engine.
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 228 | PDFiumEngine* engine_; |
// Scoped PDFium handles, exposed raw through page() and text_page().
Tom Sepez | b0048f1 | 2018-05-14 22:56:47 | [diff] [blame] | 229 | ScopedFPDFPage page_; |
| 230 | ScopedFPDFTextPage text_page_; |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 231 | int index_; |
// Starts at zero. NOTE(review): presumably counts live ScopedUnloadPreventer
// objects for this page; confirm in the .cc file.
Artem Strygin | fd53f2f | 2018-07-13 13:21:05 | [diff] [blame] | 232 | int preventing_unload_count_ = 0; |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 233 | pp::Rect rect_; |
// Each calculated_*_ flag below starts false and sits next to the container
// it relates to. calculated_links_ can also be overwritten from outside
// through set_calculated_links().
Tom Sepez | b0048f1 | 2018-05-14 22:56:47 | [diff] [blame] | 234 | bool calculated_links_ = false; |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 235 | std::vector<Link> links_; |
Pratish Kumar | 7e49da29 | 2019-08-21 19:10:30 | [diff] [blame] | 236 | bool calculated_images_ = false; |
| 237 | std::vector<Image> images_; |
Pratish Kumar | 860e883 | 2019-08-29 00:13:56 | [diff] [blame] | 238 | bool calculated_page_object_text_run_breaks_ = false; |
| 239 | // The set of character indices on which text runs need to be broken for page |
| 240 | // objects. |
| 241 | std::set<int> page_object_text_run_breaks_; |
// Read by available() and set to true by MarkAvailable(). Unlike the flags
// above it has no in-class initializer.
// NOTE(review): confirm that both constructors initialize it.
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 242 | bool available_; |
Henrique Nakashima | fd7edf2a | 2017-11-29 22:03:49 | [diff] [blame] | 243 | PDFEngine::PageFeatures page_features_; |
Tom Sepez | b0048f1 | 2018-05-14 22:56:47 | [diff] [blame] | 244 | |
| 245 | DISALLOW_COPY_AND_ASSIGN(PDFiumPage); |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 246 | }; |
| 247 | |
K Moon | 9a62bf4 | 2019-08-07 20:05:36 | [diff] [blame] | 248 | // Converts page orientations to the PDFium equivalents, as defined by |
| 249 | // FPDF_RenderPage(). |
// Declaration only: the concrete mapping from each PageOrientation value to
// the returned int is defined outside this header.
| 250 | int ToPDFiumRotation(PageOrientation orientation); |
| 251 | |
jam@chromium.org | 1b1e9eff | 2014-05-20 01:56:40 | [diff] [blame] | 252 | } // namespace chrome_pdf |
| 253 | |
| 254 | #endif // PDF_PDFIUM_PDFIUM_PAGE_H_ |