[go: nahoru, domu]

blob: 0696925c8cbf10dd6d3dce4762703f059872d969 [file] [log] [blame]
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

5#ifndef PDF_PDFIUM_PDFIUM_PAGE_H_
6#define PDF_PDFIUM_PDFIUM_PAGE_H_
7
Pratish Kumar8f9d6d6a2019-08-29 03:26:298#include <map>
9#include <set>
jam@chromium.org1b1e9eff2014-05-20 01:56:4010#include <string>
11#include <vector>
12
Virender Singh5bee21042019-08-13 07:10:0113#include "base/gtest_prod_util.h"
Henrique Nakashima9d9e0632017-10-06 21:38:1814#include "base/optional.h"
jam@chromium.org1b1e9eff2014-05-20 01:56:4015#include "base/strings/string16.h"
K Moon9a62bf42019-08-07 20:05:3616#include "pdf/page_orientation.h"
Henrique Nakashimafd7edf2a2017-11-29 22:03:4917#include "pdf/pdf_engine.h"
jam@chromium.org1b1e9eff2014-05-20 01:56:4018#include "ppapi/cpp/rect.h"
Tom Sepezb0048f12018-05-14 22:56:4719#include "third_party/pdfium/public/cpp/fpdf_scopers.h"
tsepez350243562015-05-12 01:08:4520#include "third_party/pdfium/public/fpdf_doc.h"
21#include "third_party/pdfium/public/fpdf_formfill.h"
22#include "third_party/pdfium/public/fpdf_text.h"
Henrique Nakashima1a49dbc2018-02-08 21:00:3323#include "ui/gfx/geometry/point_f.h"
jam@chromium.org1b1e9eff2014-05-20 01:56:4024
Benjamin Beaudryf80f4b52019-08-26 17:18:4825struct PP_PrivateAccessibilityTextRunInfo;
26
jam@chromium.org1b1e9eff2014-05-20 01:56:4027namespace chrome_pdf {
28
29class PDFiumEngine;
30
31// Wrapper around a page from the document.
32class PDFiumPage {
33 public:
K Moonb15a6a02019-08-30 01:18:4034 PDFiumPage(PDFiumEngine* engine, int i);
Tom Sepezb0048f12018-05-14 22:56:4735 PDFiumPage(PDFiumPage&& that);
jam@chromium.org1b1e9eff2014-05-20 01:56:4036 ~PDFiumPage();
thestigccb5fc8f2016-01-05 05:32:5137
Lei Zhangad577c622019-08-02 20:35:3238 using IsValidLinkFunction = bool (*)(const std::string& url);
39 static void SetIsValidLinkFunctionForTesting(IsValidLinkFunction function);
40
jam@chromium.org1b1e9eff2014-05-20 01:56:4041 // Unloads the PDFium data for this page from memory.
42 void Unload();
43 // Gets the FPDF_PAGE for this page, loading and parsing it if necessary.
44 FPDF_PAGE GetPage();
jam@chromium.org1b1e9eff2014-05-20 01:56:4045
46 // Returns FPDF_TEXTPAGE for the page, loading and parsing it if necessary.
47 FPDF_TEXTPAGE GetTextPage();
48
Lei Zhangf2aefe22019-08-27 22:08:0249 // See definition of PDFEngine::GetTextRunInfo().
50 base::Optional<PP_PrivateAccessibilityTextRunInfo> GetTextRunInfo(
51 int start_char_index);
dmazzonic3547a32016-06-02 05:47:1552 // Get a unicode character from the page.
53 uint32_t GetCharUnicode(int char_index);
dmazzonid48d9322016-06-13 19:37:4254 // Get the bounds of a character in page pixels.
55 pp::FloatRect GetCharBounds(int char_index);
dmazzonic3547a32016-06-02 05:47:1556
jam@chromium.org1b1e9eff2014-05-20 01:56:4057 enum Area {
58 NONSELECTABLE_AREA,
drgagec32fae262017-06-24 00:17:4959 TEXT_AREA, // Area contains regular, selectable text not
60 // within form fields.
61 WEBLINK_AREA, // Area is a hyperlink.
62 DOCLINK_AREA, // Area is a link to a different part of the same
63 // document.
64 FORM_TEXT_AREA, // Area is a form text field or form combobox text
65 // field.
jam@chromium.org1b1e9eff2014-05-20 01:56:4066 };
67
68 struct LinkTarget {
Henrique Nakashima9d9e0632017-10-06 21:38:1869 LinkTarget();
70 LinkTarget(const LinkTarget& other);
71 ~LinkTarget();
72
73 // Valid for WEBLINK_AREA only.
74 std::string url;
75
76 // Valid for DOCLINK_AREA only.
77 int page;
78 // Valid for DOCLINK_AREA only. From the top of the page.
Henrique Nakashima97f071c2018-01-11 19:56:0279 base::Optional<float> y_in_pixels;
jam@chromium.org1b1e9eff2014-05-20 01:56:4080 };
81
Mansi Awasthi6f4aa4a2019-09-07 05:34:0782 // Given a |link_index|, returns the type of underlying area and the link
83 // target. |target| must be valid. Returns NONSELECTABLE_AREA if
84 // |link_index| is invalid.
85 Area GetLinkTargetAtIndex(int link_index, LinkTarget* target);
86
Henrique Nakashima97f071c2018-01-11 19:56:0287 // Returns the (x, y) position of a destination in page coordinates.
Henrique Nakashima1a49dbc2018-02-08 21:00:3388 base::Optional<gfx::PointF> GetPageXYTarget(FPDF_DEST destination);
Henrique Nakashima97f071c2018-01-11 19:56:0289
90 // Transforms an (x, y) position in page coordinates to screen coordinates.
Henrique Nakashima1a49dbc2018-02-08 21:00:3391 gfx::PointF TransformPageToScreenXY(const gfx::PointF& xy);
Henrique Nakashima9d9e0632017-10-06 21:38:1892
jam@chromium.org1b1e9eff2014-05-20 01:56:4093 // Given a point in the document that's in this page, returns its character
94 // index if it's near a character, and also the type of text.
95 // Target is optional. It will be filled in for WEBLINK_AREA or
96 // DOCLINK_AREA only.
thestig98913ba2017-04-21 19:03:2597 Area GetCharIndex(const pp::Point& point,
K Moon9a62bf42019-08-07 20:05:3698 PageOrientation orientation,
thestig98913ba2017-04-21 19:03:2599 int* char_index,
100 int* form_type,
101 LinkTarget* target);
jam@chromium.org1b1e9eff2014-05-20 01:56:40102
drgagec32fae262017-06-24 00:17:49103 // Converts a form type to its corresponding Area.
104 static Area FormTypeToArea(int form_type);
105
jam@chromium.org1b1e9eff2014-05-20 01:56:40106 // Gets the character at the given index.
107 base::char16 GetCharAtIndex(int index);
108
109 // Gets the number of characters in the page.
110 int GetCharCount();
111
Pratish Kumare70329732019-07-22 18:07:08112 // Given a rectangle in page coordinates, computes the range of continuous
113 // characters which lie inside that rectangle. Returns false without
114 // modifying the out parameters if no character lies inside the rectangle.
115 bool GetUnderlyingTextRangeForRect(const pp::FloatRect& rect,
116 int* start_index,
Lei Zhangcf993012019-08-22 18:24:23117 int* char_len);
Pratish Kumare70329732019-07-22 18:07:08118
jam@chromium.org1b1e9eff2014-05-20 01:56:40119 // Converts from page coordinates to screen coordinates.
120 pp::Rect PageToScreen(const pp::Point& offset,
121 double zoom,
122 double left,
123 double top,
124 double right,
125 double bottom,
K Moon9a62bf42019-08-07 20:05:36126 PageOrientation orientation) const;
jam@chromium.org1b1e9eff2014-05-20 01:56:40127
Henrique Nakashimafd7edf2a2017-11-29 22:03:49128 const PDFEngine::PageFeatures* GetPageFeatures();
129
jam@chromium.org1b1e9eff2014-05-20 01:56:40130 int index() const { return index_; }
K Moonb15a6a02019-08-30 01:18:40131
Jeremy Chinsencfcbaad2019-07-19 17:26:01132 const pp::Rect& rect() const { return rect_; }
jam@chromium.org1b1e9eff2014-05-20 01:56:40133 void set_rect(const pp::Rect& r) { rect_ = r; }
K Moonb15a6a02019-08-30 01:18:40134
135 // Availability is a one-way transition: A page can become available, but it
136 // cannot become unavailable (unless deleted entirely).
jam@chromium.org1b1e9eff2014-05-20 01:56:40137 bool available() const { return available_; }
K Moonb15a6a02019-08-30 01:18:40138 void MarkAvailable() { available_ = true; }
139
jam@chromium.org1b1e9eff2014-05-20 01:56:40140 void set_calculated_links(bool calculated_links) {
thestig98913ba2017-04-21 19:03:25141 calculated_links_ = calculated_links;
jam@chromium.org1b1e9eff2014-05-20 01:56:40142 }
143
Tom Sepezb0048f12018-05-14 22:56:47144 FPDF_PAGE page() const { return page_.get(); }
145 FPDF_TEXTPAGE text_page() const { return text_page_.get(); }
146
jam@chromium.org1b1e9eff2014-05-20 01:56:40147 private:
Virender Singh5bee21042019-08-13 07:10:01148 friend class PDFiumPageLinkTest;
Pratish Kumar7e49da292019-08-21 19:10:30149 friend class PDFiumTestBase;
150
151 FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, TestCalculateImages);
Virender Singh5bee21042019-08-13 07:10:01152 FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, TestLinkGeneration);
153
jam@chromium.org1b1e9eff2014-05-20 01:56:40154 // Returns a link index if the given character index is over a link, or -1
155 // otherwise.
156 int GetLink(int char_index, LinkTarget* target);
jam@chromium.org1b1e9eff2014-05-20 01:56:40157 // Calculate the locations of any links on the page.
158 void CalculateLinks();
Pratish Kumar7e49da292019-08-21 19:10:30159 // Calculate the locations of images on the page.
160 void CalculateImages();
Henrique Nakashima9d9e0632017-10-06 21:38:18161 // Returns link type and fills target associated with a link. Returns
jam@chromium.org1b1e9eff2014-05-20 01:56:40162 // NONSELECTABLE_AREA if link detection failed.
Henrique Nakashima9d9e0632017-10-06 21:38:18163 Area GetLinkTarget(FPDF_LINK link, LinkTarget* target);
164 // Returns link type and fills target associated with a destination. Returns
165 // NONSELECTABLE_AREA if detection failed.
166 Area GetDestinationTarget(FPDF_DEST destination, LinkTarget* target);
167 // Returns link type and fills target associated with a URI action. Returns
168 // NONSELECTABLE_AREA if detection failed.
thestig139fa7482017-06-27 19:51:33169 Area GetURITarget(FPDF_ACTION uri_action, LinkTarget* target) const;
Pratish Kumar860e8832019-08-29 00:13:56170 // Calculates the set of character indices on which text runs need to be
171 // broken for page objects such as links and images.
172 void CalculatePageObjectTextRunBreaks();
jam@chromium.org1b1e9eff2014-05-20 01:56:40173
Pratish Kumar8f9d6d6a2019-08-29 03:26:29174 // Key : Marked content id for the image element as specified in the
175 // struct tree.
176 // Value : Index of image in the |images_| vector.
177 using MarkedContentIdToImageMap = std::map<int, size_t>;
178 // Traverses the entire struct tree of the page recursively and extracts the
179 // alt text from struct tree elements corresponding to the marked content IDs
180 // present in |marked_content_id_image_map|.
181 void PopulateImageAltText(
182 const MarkedContentIdToImageMap& marked_content_id_image_map);
183 // Traverses a struct element and its sub-tree recursively and extracts the
184 // alt text from struct elements corresponding to the marked content IDs
185 // present in |marked_content_id_image_map|. Uses |visited_elements| to guard
186 // against malformed struct trees.
187 void PopulateImageAltTextForStructElement(
188 const MarkedContentIdToImageMap& marked_content_id_image_map,
189 FPDF_STRUCTELEMENT current_element,
190 std::set<FPDF_STRUCTELEMENT>* visited_elements);
191
Artem Stryginfd53f2f2018-07-13 13:21:05192 class ScopedUnloadPreventer {
thestig03ac42d2014-12-20 16:04:00193 public:
Artem Stryginfd53f2f2018-07-13 13:21:05194 explicit ScopedUnloadPreventer(PDFiumPage* page);
195 ~ScopedUnloadPreventer();
thestig03ac42d2014-12-20 16:04:00196
197 private:
198 PDFiumPage* const page_;
199 };
200
jam@chromium.org1b1e9eff2014-05-20 01:56:40201 struct Link {
202 Link();
thestigfa6edbc72016-08-23 08:07:00203 Link(const Link& that);
jam@chromium.org1b1e9eff2014-05-20 01:56:40204 ~Link();
205
Virender Singh5bee21042019-08-13 07:10:01206 // Represents start index of underlying text range. Should be -1 if the link
207 // is not over text.
208 int32_t start_char_index = -1;
209 // Represents the number of characters that the link overlaps with.
210 int32_t char_count = 0;
211 std::vector<pp::Rect> bounding_rects;
212
213 // Valid for links with external urls only.
jam@chromium.org1b1e9eff2014-05-20 01:56:40214 std::string url;
jam@chromium.org1b1e9eff2014-05-20 01:56:40215 };
216
Pratish Kumar7e49da292019-08-21 19:10:30217 // Represents an Image inside the page.
218 struct Image {
219 Image();
220 Image(const Image& other);
221 ~Image();
222
223 pp::Rect bounding_rect;
Pratish Kumar8f9d6d6a2019-08-29 03:26:29224 // Alt text is available only for tagged PDFs.
225 std::string alt_text;
Pratish Kumar7e49da292019-08-21 19:10:30226 };
227
jam@chromium.org1b1e9eff2014-05-20 01:56:40228 PDFiumEngine* engine_;
Tom Sepezb0048f12018-05-14 22:56:47229 ScopedFPDFPage page_;
230 ScopedFPDFTextPage text_page_;
jam@chromium.org1b1e9eff2014-05-20 01:56:40231 int index_;
Artem Stryginfd53f2f2018-07-13 13:21:05232 int preventing_unload_count_ = 0;
jam@chromium.org1b1e9eff2014-05-20 01:56:40233 pp::Rect rect_;
Tom Sepezb0048f12018-05-14 22:56:47234 bool calculated_links_ = false;
jam@chromium.org1b1e9eff2014-05-20 01:56:40235 std::vector<Link> links_;
Pratish Kumar7e49da292019-08-21 19:10:30236 bool calculated_images_ = false;
237 std::vector<Image> images_;
Pratish Kumar860e8832019-08-29 00:13:56238 bool calculated_page_object_text_run_breaks_ = false;
239 // The set of character indices on which text runs need to be broken for page
240 // objects.
241 std::set<int> page_object_text_run_breaks_;
jam@chromium.org1b1e9eff2014-05-20 01:56:40242 bool available_;
Henrique Nakashimafd7edf2a2017-11-29 22:03:49243 PDFEngine::PageFeatures page_features_;
Tom Sepezb0048f12018-05-14 22:56:47244
245 DISALLOW_COPY_AND_ASSIGN(PDFiumPage);
jam@chromium.org1b1e9eff2014-05-20 01:56:40246};
247
K Moon9a62bf42019-08-07 20:05:36248// Converts page orientations to the PDFium equivalents, as defined by
249// FPDF_RenderPage().
250int ToPDFiumRotation(PageOrientation orientation);
251
jam@chromium.org1b1e9eff2014-05-20 01:56:40252} // namespace chrome_pdf
253
254#endif // PDF_PDFIUM_PDFIUM_PAGE_H_