iubenda Cookie Solution for GDPR - Version 1.14.0-beta4

Version Description

Download this release

Release Info

Developer Facens
Plugin Icon 128x128 iubenda Cookie Solution for GDPR
Version 1.14.0-beta4
Comparing to
See all releases

Code changes from version 1.14.0-beta2 to 1.14.0-beta4

css/admin.css CHANGED
@@ -3,150 +3,43 @@
3
 
4
  width: 90%;
5
  margin: 50px auto;
6
- padding-bottom: 30px;
 
7
 
8
  border-radius: 2px;
9
  background-color: #FFFFFF;
10
  box-shadow: 0 0 5px 0 rgba(0, 0, 0, 0.1)
11
  }
12
- #iubenda-tabs {
13
- display: table;
14
- width: 80%;
15
- margin: 0 auto;
16
- }
17
- .iubenda-tab {
18
- cursor: pointer;
19
-
20
- display: inline-table;
21
- padding: 10px 15px;
22
- padding-top: 0;
23
- margin-right: 10px;
24
-
25
- background-color: #FFFFFF;
26
- border-top: 2px solid #454545;
27
- box-shadow: 0 -2px 3px 0 rgba(5, 5, 5, 0.2);
28
-
29
- color: #454545
30
- }
31
- .iubenda-tab:before, .iubenda-tab::before {
32
- content: "";
33
-
34
- display: table;
35
- margin: auto;
36
- margin-bottom: 8px;
37
-
38
- border-style: solid;
39
- border-width: 5px 5px 0;
40
- border-color: transparent transparent;
41
- }
42
- #iubenda-code-it, #iubenda-code-de, #iubenda-code-ru, #iubenda-code-es, #iubenda-code-pt-br, #iubenda-code-en, #iubenda-code-fr {
43
- display: none;
44
-
45
- width: 100%;
46
-
47
- background-color: #FFFFFF;
48
- box-shadow: 0 0 3px 0 rgba(5, 5, 5, 0.2);
49
- }
50
- #iubenda-information {
51
- width: 100%;
52
- padding-top: 25px;
53
- padding-bottom: 25px;
54
-
55
-
56
- text-indent: 20px;
57
-
58
- background-color: #FFFFFF;
59
- box-shadow: 0 0 3px 0 rgba(5, 5, 5, 0.2);
60
- }
61
- .iubenda-textarea {
62
- display: block;
63
- width: 95%; height: 300px;
64
- margin: 20px auto;
65
-
66
- overflow-y: scroll;
67
-
68
- outline: none;
69
- resize: vertical
70
- }
71
- .iubenda-textarea-single {
72
- width: 100%;
73
- box-shadow: 0 0 3px 0 rgba(5, 5, 5, 0.2);
74
- }
75
- [data-active-tab] {
76
- border-color: #1FAD81;
77
-
78
- color: #1FAD81
79
- }
80
- [data-active-tab]:before, [data-active-tab]::before {
81
- border-color: #1FAD81 transparent
82
- }
83
- #iubenda-logo {
84
- display: table;
85
  }
86
  .iubenda-link {
87
  display: table;
88
- margin: 55px auto;
89
- }
90
- #iubenda-save {
91
- transition: background-color 300ms, box-shadow 300ms;
92
- cursor: pointer;
93
-
94
- display: table;
95
- margin: 20px 0;
96
- padding: 5px 10px;
97
-
98
- background-color: #1FAD81;
99
- border: none;
100
- border-radius: 1px;
101
- box-shadow: 0 0 3px 0 rgba(5, 5, 5, 0.2);
102
-
103
- color: #FFFFFF
104
- }
105
- #iubenda-save:hover {
106
- background-color: #199C71;
107
- box-shadow: 0 0 4px 0 rgba(5, 5, 5, 0.5);
108
- }
109
- #iubenda-saved {
110
- display: table;
111
- margin: 10px 0;
112
- width: 100%; height: 30px;
113
-
114
- background-color: #199C71;
115
- border-radius: 1px;
116
- box-shadow: 0 0 3px 0 rgba(5, 5, 5, 0.2);
117
- }
118
- #iubenda-saved p {
119
- display: table;
120
- margin: 10px;
121
-
122
- color: #FFFFFF
123
  }
124
  .iubenda-text {
125
- display: table;
126
- width: 80%;
127
- margin: 5px auto;
128
-
129
  color: #434149
130
  }
131
- .iubenda-title {
132
- display: table;
 
 
133
  margin-top: 10px;
134
-
135
- font-weight: bold;
136
  }
137
- .iubenda-url {
138
- display: inline-table;
139
  margin-bottom: 10px;
140
  }
141
- #parser_engine label input {
142
- margin-right: 5px
 
 
143
  }
144
- #parser_engine, #par_skip_parsing, #par_iubenda_output_filter {
145
- margin-left: 30px;
146
  }
147
- .wp-admin .iubenda-checkbox-options p label input[type=checkbox], .iubenda-checkbox-options label span {
148
- display: inline-block;
149
- vertical-align: top;
150
- margin-top: 0;
151
- line-height: 1.2;
152
  }
3
 
4
  width: 90%;
5
  margin: 50px auto;
6
+ padding: 50px 6%;
7
+ box-sizing: border-box;
8
 
9
  border-radius: 2px;
10
  background-color: #FFFFFF;
11
  box-shadow: 0 0 5px 0 rgba(0, 0, 0, 0.1)
12
  }
13
+ .iubenda-title {
14
+ margin-top: 10px;
15
+ font-weight: bold;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  }
17
  .iubenda-link {
18
  display: table;
19
+ margin: 0 auto 40px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  }
21
  .iubenda-text {
22
+ margin: 10px auto;
 
 
 
23
  color: #434149
24
  }
25
+ #iubenda-tabs {
26
+ margin: 30px auto 20px;
27
+ }
28
+ #iub_parser_engine_container {
29
  margin-top: 10px;
 
 
30
  }
31
+ #iub_parser_engine_container > div:not(:last-child) {
 
32
  margin-bottom: 10px;
33
  }
34
+ #iubenda-tabs #contextual-help-back {
35
+ right: 0;
36
+ border-top: 1px solid #e1e1e1;
37
+ border-bottom: 1px solid #e1e1e1;
38
  }
39
+ #iubenda-tabs .help-tab-content {
40
+ margin-right: 0;
41
  }
42
+ #iubenda-tabs .help-tab-content textarea {
43
+ margin-top: 18px;
44
+ width: 100%;
 
 
45
  }
iubenda-cookie-class/iubenda.class.faster.php DELETED
@@ -1,190 +0,0 @@
1
- <?php
2
-
3
- /**
4
- * iubenda.class.php
5
- * version: 2.0.2
6
- * codename: Faster
7
- * @author: Copyright 2015 iubenda
8
- * @license GNU/GPL
9
- * This program is free software: you can redistribute it and/or modify
10
- * it under the terms of the GNU General Public License as published by
11
- * the Free Software Foundation, either version 3 of the License, or
12
- * (at your option) any later version.
13
- *
14
- * This program is distributed in the hope that it will be useful,
15
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- * GNU General Public License for more details.
18
- *
19
- *
20
- * You should have received a copy of the GNU General Public License
21
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
22
- */
23
-
24
- class iubendaFaster
25
- {
26
- /*
27
- Variables
28
- */
29
-
30
- private $getBlack = array
31
- (
32
- array
33
- (
34
- /*
35
- Domains
36
- */
37
-
38
- "platform.twitter.com/widgets.js",
39
- "apis.google.com/js/plusone.js",
40
- "apis.google.com/js/platform.js",
41
- "connect.facebook.net",
42
- "www.youtube.com/iframe_api",
43
- "pagead2.googlesyndication.com/pagead/js/adsbygoogle.js",
44
- "sharethis.com/button/buttons.js",
45
- "addthis.com/js/",
46
-
47
- /*
48
- JavaScript
49
- */
50
-
51
- "window.adsbygoogle"
52
- ),
53
- array
54
- (
55
- "youtube.com",
56
- "platform.twitter.com",
57
- "www.facebook.com/plugins/like.php",
58
- "www.facebook.com/plugins/likebox.php",
59
- "apis.google.com",
60
- "www.google.com/maps/embed/",
61
- "player.vimeo.com/video",
62
- "maps.google.it/maps",
63
- "www.google.com/maps/embed"
64
- )
65
- );
66
-
67
- /**/
68
-
69
- private $getBlank = "//cdn.iubenda.com/cookie_solution/empty.html";
70
-
71
- private $getClass = array("_iub_cs_activate", "_iub_cs_activate-inline");
72
-
73
- /*
74
- Methods
75
- */
76
-
77
- public function isBlack($offender, $blacklist)
78
- {
79
- /*
80
- Check if a string is in the black list.
81
- */
82
-
83
- if(empty($offender) || empty($blacklist)) {
84
-
85
- return false;
86
-
87
- }
88
-
89
- /**/
90
-
91
- foreach($blacklist as $black)
92
- {
93
- if(strpos($offender, $black) !== false) {
94
-
95
- return true;
96
-
97
- }
98
- }
99
-
100
- /**/
101
-
102
- return false;
103
- }
104
-
105
- /**/
106
-
107
- public function isParse($offender)
108
- {
109
- /*
110
- Parse the entire document and search for black elements.
111
- */
112
-
113
- libxml_use_internal_errors(true);
114
-
115
- /**/
116
-
117
- $src = "";
118
-
119
- $blank = $this -> getBlank;
120
- $class = $this -> getClass;
121
-
122
- $list_1 = $this -> getBlack[0];
123
- $list_2 = $this -> getBlack[1];
124
-
125
- $document = new DOMDocument();
126
-
127
- /**/
128
-
129
- $document -> formatOutput = true;
130
- $document -> preserveWhiteSpace = false;
131
-
132
- /**/
133
-
134
- $document -> loadHTML($offender);
135
-
136
- /**/
137
-
138
- $scripts = $document -> getElementsByTagName("script");
139
- $iframes = $document -> getElementsByTagName("iframe");
140
-
141
- /*
142
- Parse the found elements and check which ones are in the black list.
143
- */
144
-
145
- foreach($scripts as $script)
146
- {
147
- $src = $script -> getAttribute("src");
148
-
149
- /**/
150
-
151
- if($this -> isBlack($src, $list_1))
152
- {
153
- $script -> setAttribute("type", "text/plain");
154
- $script -> setAttribute("class", $script -> getAttribute("class")." ".$class[0]);
155
- }
156
- elseif($this -> isBlack($script -> nodeValue, $list_1))
157
- {
158
- $script -> setAttribute("type", "text/plain");
159
- $script -> setAttribute("class", $script -> getAttribute("class")." ".$class[1]);
160
- }
161
- }
162
- foreach($iframes as $iframe) {
163
-
164
- $src = $iframe -> getAttribute("src");
165
-
166
- /**/
167
-
168
- if($this -> isBlack($src, $list_2))
169
- {
170
- $iframe -> setAttribute("src", $blank);
171
- $iframe -> setAttribute("suppressedsrc", $src);
172
- $iframe -> setAttribute("class", $iframe -> getAttribute("class")." ".$class[0]);
173
- }
174
- }
175
-
176
- /**/
177
-
178
- $offender = $document -> saveHTML();
179
-
180
- /**/
181
-
182
- libxml_use_internal_errors(false);
183
-
184
- /**/
185
-
186
- return $offender;
187
- }
188
- }
189
-
190
- ?>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
iubenda-cookie-class/iubenda.class.php DELETED
@@ -1,377 +0,0 @@
1
- <?php
2
- /**
3
- * iubenda.class.php
4
- * version: 1.0.2
5
- * @author: Copyright 2015 iubenda
6
- * @license GNU/GPL
7
- * This program is free software: you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation, either version 3 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- *
18
- * You should have received a copy of the GNU General Public License
19
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
20
- */
21
-
22
- class iubendaPage {
23
-
24
- const IUB_REGEX_PATTERN = '/<!--\s*IUB_COOKIE_POLICY_START\s*-->(.*?)<!--\s*IUB_COOKIE_POLICY_END\s*-->/sU';
25
-
26
- public $auto_script_tags = array(
27
- 'platform.twitter.com/widgets.js',
28
- 'apis.google.com/js/plusone.js',
29
- 'apis.google.com/js/platform.js',
30
- 'connect.facebook.net',
31
- 'www.youtube.com/iframe_api',
32
- 'pagead2.googlesyndication.com/pagead/js/adsbygoogle.js',
33
- 'sharethis.com/button/buttons.js',
34
- 'addthis.com/js/',
35
- 'window.adsbygoogle'
36
- );
37
-
38
- public $auto_iframe_tags = array(
39
- 'youtube.com',
40
- 'platform.twitter.com',
41
- 'www.facebook.com/plugins/like.php',
42
- 'www.facebook.com/plugins/likebox.php',
43
- 'apis.google.com',
44
- 'www.google.com/maps/embed/',
45
- 'player.vimeo.com/video',
46
- 'maps.google.it/maps',
47
- 'www.google.com/maps/embed',
48
- 'window.adsbygoogle'
49
- );
50
-
51
- public $iub_comments_detected = array();
52
- public $iframe_detected = array();
53
- public $iframe_converted = array();
54
- public $scripts_detected = array();
55
- public $scripts_inline_detected = array();
56
- public $scripts_inline_converted = array();
57
- public $scripts_converted = array();
58
-
59
-
60
- /*
61
- construct: the whole HTML output of the page
62
- */
63
- public function __construct($content_page){
64
- $this->original_content_page = $content_page;
65
- $this->content_page = $content_page;
66
- }
67
-
68
- /*
69
- print iubenda banner, parameter: the script code of iubenda to print the banner
70
- */
71
- public function print_banner($banner){
72
- return $banner.= "\n
73
- <script>
74
- var iCallback = function(){};
75
-
76
- if('callback' in _iub.csConfiguration) {
77
- if('onConsentGiven' in _iub.csConfiguration.callback) iCallback = _iub.csConfiguration.callback.onConsentGiven;
78
-
79
- _iub.csConfiguration.callback.onConsentGiven = function()
80
- {
81
- iCallback();
82
-
83
- /*
84
- * Separator
85
- */
86
-
87
- jQuery('noscript._no_script_iub').each(function (a, b) { var el = jQuery(b); el.after(el.html()); });
88
- };
89
- };
90
- </script>";
91
- }
92
-
93
- /*
94
- Static, detect bot & crawler
95
- */
96
- static function bot_detected() {
97
- return (isset($_SERVER['HTTP_USER_AGENT']) && preg_match('/bot|crawl|slurp|spider|google|yahoo/i', $_SERVER['HTTP_USER_AGENT']));
98
- }
99
-
100
- /*
101
- Static, utility function: Return true if the user has already given consent on the page
102
- */
103
- static function consent_given(){
104
- foreach($_COOKIE as $key => $value){
105
- if(Page::strpos_array($key, array('_iub_cs-s', '_iub_cs'))){
106
- return true;
107
- }
108
- }
109
- return false;
110
- }
111
- /*
112
- Static, utility function: strpos for array
113
- */
114
- static function strpos_array($haystack, $needle){
115
- if(is_array($needle)){
116
- foreach($needle as $need){
117
- if(strpos($haystack, $need) !== false){
118
- return true;
119
- }
120
- }
121
- }else{
122
- if(strpos($haystack, $need) !== false) {
123
- return true;
124
- }
125
- }
126
- return false;
127
- }
128
-
129
-
130
- /* Convert scripts, iframe and other code inside IUBENDAs comment in text/plain to not generate cookies */
131
- public function create_tags($html){
132
-
133
- $elements = $html->find("*");
134
- $js = '';
135
-
136
- if(is_array($elements)){
137
- $count = count($elements);
138
- for($j=0; $j<$count; $j++){
139
- $e = $elements[$j];
140
- switch($e->tag){
141
- case 'script':
142
- $class = $e->class;
143
- $e->class = $class . ' _iub_cs_activate';
144
- $e->type = 'text/plain';
145
- $js.= $e->outertext;
146
- break;
147
-
148
- case 'iframe':
149
- $new_src = "//cdn.iubenda.com/cookie_solution/empty.html";
150
- $class = $e->class;
151
- $e->suppressedsrc = $e->src;
152
- $e->src = $new_src;
153
- $e->class = $class . ' _iub_cs_activate';
154
- $js.= $e->outertext;
155
- break;
156
-
157
- default:
158
- $js = $html;
159
- break;
160
- }
161
- }
162
- }
163
- return $js;
164
- }
165
-
166
- /* Parse all IUBENDAs comment and convert the code inside with create_tags method */
167
- public function parse_iubenda_comments(){
168
- preg_match_all(self::IUB_REGEX_PATTERN, $this->content_page, $scripts);
169
- if(is_array($scripts[1])){
170
- $count = count($scripts[1]);
171
- $js_scripts = array();
172
- for($j=0; $j<$count; $j++){
173
- $this->iub_comments_detected[] = $scripts[1][$j];
174
- $html = str_get_html($scripts[1][$j], $lowercase=true, $forceTagsClosed=true, $stripRN=false);
175
- $js_scripts[] = $this->create_tags($html);
176
- }
177
-
178
- if(is_array($scripts[1]) && is_array($js_scripts)){
179
- if(count($scripts[1]) >= 1 && count($js_scripts) >= 1){
180
- $this->content_page = strtr($this->content_page, array_combine($scripts[1], $js_scripts));
181
- }
182
- }
183
- }
184
- }
185
-
186
- /* Parse automatically all the scripts in the page and converts it in text/plain
187
- if src or the whole output has inside one of the elements in $auto_script_tags array */
188
- public function parse_scripts(){
189
-
190
- $html = str_get_html($this->content_page, $lowercase=true, $forceTagsClosed=true, $stripRN=false);
191
- if(is_object($html)){
192
- $scripts = $html->find("script");
193
- if(is_array($scripts)){
194
- $count = count($scripts);
195
- for($j=0; $j<$count; $j++){
196
- $s = $scripts[$j];
197
- if(!empty($s->innertext)){
198
- $this->scripts_detected[] = $s->innertext;
199
- if (Page::strpos_array($s->innertext, $this->auto_script_tags) !== false) {
200
- $class = $s->class;
201
- $s->class = $class . ' _iub_cs_activate-inline';
202
- $s->type = 'text/plain';
203
- $this->scripts_converted[] = $s->innertext;
204
- }
205
- }else{
206
- $src = $s->src;
207
- if($src){
208
- $this->scripts_inline_detected[] = $src;
209
- if (Page::strpos_array($src, $this->auto_script_tags) !== false) {
210
- $class = $s->class;
211
- $s->class = $class . ' _iub_cs_activate';
212
- $s->type = 'text/plain';
213
- $this->scripts_inline_converted[] = $src;
214
- }
215
- }
216
- }
217
- }
218
- }
219
-
220
- /*
221
- * AdSense check by Peste Vasile Alexandru, AdSense here
222
- */
223
-
224
- $ad_found = false;
225
-
226
- while(preg_match("#google_ad_client =(.*?);#i", $html))
227
- {
228
- $ad_found = true;
229
- $ad_client = null;
230
- $ad_slot = null;
231
- $ad_width = null;
232
- $ad_height = null;
233
- $ad_block = null;
234
-
235
- /**/
236
-
237
- preg_match("#google_ad_client =(.*?);#i", $html, $ad_client);
238
- preg_match("#google_ad_slot =(.*?);#i", $html, $ad_slot);
239
- preg_match("#google_ad_width =(.*?);#i", $html, $ad_width);
240
- preg_match("#google_ad_height =(.*?);#i", $html, $ad_height);
241
-
242
- /**/
243
-
244
- $html = preg_replace("#google_ad_client =(.*?);#i", "", $html, 1);
245
- $html = preg_replace("#google_ad_slot =(.*?);#i", "", $html, 1);
246
- $html = preg_replace("#google_ad_width =(.*?);#i", "", $html, 1);
247
- $html = preg_replace("#google_ad_height =(.*?);#i", "", $html, 1);
248
-
249
- /**/
250
-
251
- $ad_client = trim($ad_client[1]);
252
- $ad_slot = trim($ad_slot[1]);
253
- $ad_width = trim($ad_width[1]);
254
- $ad_height = trim($ad_height[1]);
255
-
256
- /**/
257
-
258
- $ad_class = 'class="_iub_cs_activate_google_ads"';
259
- $ad_style = 'style="width:'.$ad_width.'px; height:'.$ad_height.'px;"';
260
-
261
- $ad_client = 'data-client='.$ad_client;
262
- $ad_slot = 'data-slot='.$ad_slot;
263
- $ad_width = 'data-width="'.$ad_width.'"';
264
- $ad_height = 'data-height="'.$ad_height.'"';
265
-
266
- /**/
267
-
268
- $ad_block = "<div $ad_style $ad_class $ad_width $ad_height $ad_slot $ad_client></div>";
269
-
270
- /**/
271
-
272
- $html = preg_replace('#(<[^>]+) src="//pagead2.googlesyndication.com/pagead/show_ads.js"(.*?)</script>#i', $ad_block, $html, 1);
273
- }
274
-
275
- /**/
276
-
277
- if($ad_found)
278
- {
279
- $adsense_callback =
280
- "
281
- <script>
282
- function iubenda_adsense_unblock(){
283
- var t = 1;
284
- jQuery('._iub_cs_activate_google_ads').each(function() {
285
- var banner = jQuery(this);
286
- setTimeout(function(){
287
- var client = banner.data('client');
288
- var slot = banner.data('slot');
289
- var width = banner.data('width');
290
- var height = banner.data('height');
291
- var adsense_script = '<scr'+'ipt>'
292
- + 'google_ad_client = ".chr(34)."'+client+'".chr(34).";'
293
- + 'google_ad_slot = '+slot+';'
294
- + 'google_ad_width = '+width+';'
295
- + 'google_ad_height = '+height+';'
296
- + '</scr'+'ipt>';
297
- var script = document.createElement('script');
298
- var ads = document.createElement('ads');
299
- var w = document.write;
300
- script.setAttribute('type', 'text/javascript');
301
- script.setAttribute('src', 'http://pagead2.googlesyndication.com/pagead/show_ads.js');
302
- document.write = (function(params) {
303
- ads.innerHTML = params;
304
- document.write = w;
305
- });
306
- banner.html(adsense_script).append(ads).append(script);
307
- }, t);
308
- t += 300;
309
- });
310
- }
311
- if('callback' in _iub.csConfiguration) {
312
- _iub.csConfiguration.callback.onConsentGiven = iubenda_adsense_unblock;
313
- }
314
- else
315
- {
316
- _iub.csConfiguration.callback = {};
317
-
318
- _iub.csConfiguration.callback.onConsentGiven = iubenda_adsense_unblock;
319
- }
320
- </script>
321
- ";
322
-
323
- $html = str_replace("</body>", $adsense_callback."</body>", $html);
324
- }
325
-
326
- /**/
327
-
328
- $this->content_page = $html;
329
- }
330
- }
331
-
332
- /* Parse automatically all the iframe in the page and change the src to suppressedsrc
333
- if src has inside one of the elements in $auto_iframe_tags array */
334
- public function parse_iframe(){
335
- $html = str_get_html($this->content_page, $lowercase=true, $forceTagsClosed=true, $stripRN=false);
336
- if(is_object($html)){
337
- $iframes = $html->find("iframe");
338
- if(is_array($iframes)){
339
- $count = count($iframes);
340
- for($j=0; $j<$count; $j++){
341
- $i = $iframes[$j];
342
- $src = $i->src;
343
- $this->iframe_detected[] = $src;
344
- if (Page::strpos_array($src, $this->auto_iframe_tags) !== false){
345
- $new_src = "//cdn.iubenda.com/cookie_solution/empty.html";
346
- $class = $i->class;
347
- $i->suppressedsrc = $src;
348
- $i->src = $new_src;
349
- $i->class = $class . ' _iub_cs_activate';
350
- $this->iframe_converted[] = $src;
351
- }
352
- }
353
- }
354
- $this->content_page = $html;
355
- }
356
- }
357
-
358
- /*
359
- Call three methods to parse the page, iubendas comment, scripts + iframe
360
- */
361
- public function parse()
362
- {
363
- $this->parse_iubenda_comments();
364
- $this->parse_scripts();
365
- $this->parse_iframe();
366
- }
367
-
368
- /*
369
- Return the final page to output
370
- */
371
- public function get_converted_page(){
372
- return $this->content_page;
373
- }
374
-
375
- }
376
-
377
- ?>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
iubenda-cookie-class/simple_html_dom.php DELETED
@@ -1,1734 +0,0 @@
1
- <?php
2
- /**
3
- * Website: http://sourceforge.net/projects/simplehtmldom/
4
- * Additional projects that may be used: http://sourceforge.net/projects/debugobject/
5
- * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
6
- * Contributions by:
7
- * Yousuke Kumakura (Attribute filters)
8
- * Vadim Voituk (Negative indexes supports of "find" method)
9
- * Antcs (Constructor with automatically load contents either text or file/url)
10
- *
11
- * all affected sections have comments starting with "PaperG"
12
- *
13
- * Paperg - Added case insensitive testing of the value of the selector.
14
- * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
15
- * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source,
16
- * it will almost always be smaller by some amount.
17
- * We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate as the $dom->size is the "real" number of bytes the dom was created from.
18
- * but for most purposes, it's a really good estimation.
19
- * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
20
- * Allow the user to tell us how much they trust the html.
21
- * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
22
- * This allows for us to find tags based on the text they contain.
23
- * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
24
- * Paperg: added parse_charset so that we know about the character set of the source document.
25
- * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
26
- * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
27
- *
28
- * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
29
- * PaperG (John Schlick) Added get_display_size for "IMG" tags.
30
- *
31
- * Licensed under The MIT License
32
- * Redistributions of files must retain the above copyright notice.
33
- *
34
- * @author S.C. Chen <me578022@gmail.com>
35
- * @author John Schlick
36
- * @author Rus Carroll
37
- * @version 1.5 ($Rev: 210 $)
38
- * @package PlaceLocalInclude
39
- * @subpackage simple_html_dom
40
- */
41
-
42
- /**
43
- * All of the Defines for the classes below.
44
- * @author S.C. Chen <me578022@gmail.com>
45
- */
46
- define('HDOM_TYPE_ELEMENT', 1);
47
- define('HDOM_TYPE_COMMENT', 2);
48
- define('HDOM_TYPE_TEXT', 3);
49
- define('HDOM_TYPE_ENDTAG', 4);
50
- define('HDOM_TYPE_ROOT', 5);
51
- define('HDOM_TYPE_UNKNOWN', 6);
52
- define('HDOM_QUOTE_DOUBLE', 0);
53
- define('HDOM_QUOTE_SINGLE', 1);
54
- define('HDOM_QUOTE_NO', 3);
55
- define('HDOM_INFO_BEGIN', 0);
56
- define('HDOM_INFO_END', 1);
57
- define('HDOM_INFO_QUOTE', 2);
58
- define('HDOM_INFO_SPACE', 3);
59
- define('HDOM_INFO_TEXT', 4);
60
- define('HDOM_INFO_INNER', 5);
61
- define('HDOM_INFO_OUTER', 6);
62
- define('HDOM_INFO_ENDSPACE',7);
63
- // helper functions
64
- // -----------------------------------------------------------------------------
65
- // get html dom from file
66
- // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
67
- function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $stripRN=true)
68
- {
69
- // We DO force the tags to be terminated.
70
- $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $stripRN, $defaultBRText, $defaultSpanText);
71
- // For sourceforge users: uncomment the next line and comment the retrieve_url_contents line 2 lines down if it is not already done.
72
- $contents = file_get_contents($url, $use_include_path, $context, $offset);
73
- // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
74
- //$contents = retrieve_url_contents($url);
75
- if (empty($contents))
76
- {
77
- return false;
78
- }
79
- // The second parameter can force the selectors to all be lowercase.
80
- $dom->load($contents, $lowercase, $stripRN);
81
- return $dom;
82
- }
83
-
84
- // get html dom from string
85
- function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $stripRN=true)
86
- {
87
- $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $stripRN);
88
- if (empty($str))
89
- {
90
- $dom->clear();
91
- return false;
92
- }
93
- $dom->load($str, $lowercase, $stripRN);
94
- return $dom;
95
- }
96
-
97
- // dump html dom tree
98
- function dump_html_tree($node, $show_attr=true, $deep=0)
99
- {
100
- $node->dump($node);
101
- }
102
-
103
-
104
- /**
105
- * simple html dom node
106
- * PaperG - added ability for "find" routine to lowercase the value of the selector.
107
- * PaperG - added $tag_start to track the start position of the tag in the total byte index
108
- *
109
- * @package PlaceLocalInclude
110
- */
111
- class simple_html_dom_node
112
- {
113
- public $nodetype = HDOM_TYPE_TEXT;
114
- public $tag = 'text';
115
- public $attr = array();
116
- public $children = array();
117
- public $nodes = array();
118
- public $parent = null;
119
- // The "info" array - see HDOM_INFO_... for what each element contains.
120
- public $_ = array();
121
- public $tag_start = 0;
122
- private $dom = null;
123
-
124
- function __construct($dom)
125
- {
126
- $this->dom = $dom;
127
- $dom->nodes[] = $this;
128
- }
129
-
130
- function __destruct()
131
- {
132
- $this->clear();
133
- }
134
-
135
- function __toString()
136
- {
137
- return $this->outertext();
138
- }
139
-
140
- // clean up memory due to php5 circular references memory leak...
141
- function clear()
142
- {
143
- $this->dom = null;
144
- $this->nodes = null;
145
- $this->parent = null;
146
- $this->children = null;
147
- }
148
-
149
- // dump node's tree
150
- function dump($show_attr=true, $deep=0)
151
- {
152
- $lead = str_repeat(' ', $deep);
153
-
154
- echo $lead.$this->tag;
155
- if ($show_attr && count($this->attr)>0)
156
- {
157
- echo '(';
158
- foreach ($this->attr as $k=>$v)
159
- echo "[$k]=>\"".$this->$k.'", ';
160
- echo ')';
161
- }
162
- echo "\n";
163
-
164
- if ($this->nodes)
165
- {
166
- foreach ($this->nodes as $c)
167
- {
168
- $c->dump($show_attr, $deep+1);
169
- }
170
- }
171
- }
172
-
173
-
174
- // Debugging function to dump a single dom node with a bunch of information about it.
175
- function dump_node($echo=true)
176
- {
177
-
178
- $string = $this->tag;
179
- if (count($this->attr)>0)
180
- {
181
- $string .= '(';
182
- foreach ($this->attr as $k=>$v)
183
- {
184
- $string .= "[$k]=>\"".$this->$k.'", ';
185
- }
186
- $string .= ')';
187
- }
188
- if (count($this->_)>0)
189
- {
190
- $string .= ' $_ (';
191
- foreach ($this->_ as $k=>$v)
192
- {
193
- if (is_array($v))
194
- {
195
- $string .= "[$k]=>(";
196
- foreach ($v as $k2=>$v2)
197
- {
198
- $string .= "[$k2]=>\"".$v2.'", ';
199
- }
200
- $string .= ")";
201
- } else {
202
- $string .= "[$k]=>\"".$v.'", ';
203
- }
204
- }
205
- $string .= ")";
206
- }
207
-
208
- if (isset($this->text))
209
- {
210
- $string .= " text: (" . $this->text . ")";
211
- }
212
-
213
- $string .= " HDOM_INNER_INFO: '";
214
- if (isset($node->_[HDOM_INFO_INNER]))
215
- {
216
- $string .= $node->_[HDOM_INFO_INNER] . "'";
217
- }
218
- else
219
- {
220
- $string .= ' NULL ';
221
- }
222
-
223
- $string .= " children: " . count($this->children);
224
- $string .= " nodes: " . count($this->nodes);
225
- $string .= " tag_start: " . $this->tag_start;
226
- $string .= "\n";
227
-
228
- if ($echo)
229
- {
230
- echo $string;
231
- return;
232
- }
233
- else
234
- {
235
- return $string;
236
- }
237
- }
238
-
239
- // returns the parent of node
240
- // If a node is passed in, it will reset the parent of the current node to that one.
241
- function parent($parent=null)
242
- {
243
- // I am SURE that this doesn't work properly.
244
- // It fails to unset the current node from its current parent's nodes or children list first.
245
- if ($parent !== null)
246
- {
247
- $this->parent = $parent;
248
- $this->parent->nodes[] = $this;
249
- $this->parent->children[] = $this;
250
- }
251
-
252
- return $this->parent;
253
- }
254
-
255
- // verify that node has children
256
- function has_child()
257
- {
258
- return !empty($this->children);
259
- }
260
-
261
- // returns children of node
262
- function children($idx=-1)
263
- {
264
- if ($idx===-1)
265
- {
266
- return $this->children;
267
- }
268
- if (isset($this->children[$idx]))
269
- {
270
- return $this->children[$idx];
271
- }
272
- return null;
273
- }
274
-
275
- // returns the first child of node
276
- function first_child()
277
- {
278
- if (count($this->children)>0)
279
- {
280
- return $this->children[0];
281
- }
282
- return null;
283
- }
284
-
285
- // returns the last child of node
286
- function last_child()
287
- {
288
- if (($count=count($this->children))>0)
289
- {
290
- return $this->children[$count-1];
291
- }
292
- return null;
293
- }
294
-
295
- // returns the next sibling of node
296
- function next_sibling()
297
- {
298
- if ($this->parent===null)
299
- {
300
- return null;
301
- }
302
-
303
- $idx = 0;
304
- $count = count($this->parent->children);
305
- while ($idx<$count && $this!==$this->parent->children[$idx])
306
- {
307
- ++$idx;
308
- }
309
- if (++$idx>=$count)
310
- {
311
- return null;
312
- }
313
- return $this->parent->children[$idx];
314
- }
315
-
316
/**
 * Return the sibling that precedes this node in its parent's children list.
 *
 * A node that is not present in the parent's children list has no previous
 * sibling; the earlier implementation returned the parent's last child in
 * that case.
 *
 * @return object|null The previous child of the parent, or null when this node
 *                     has no parent, is the first child, or is not among the children.
 */
function prev_sibling()
{
    if ($this->parent === null) return null;

    // Strict search compares object identity, matching the original `!==` scan.
    $pos = array_search($this, $this->parent->children, true);
    if ($pos === false || $pos < 1) return null;

    return $this->parent->children[$pos - 1];
}
327
-
328
/**
 * Walk from this node towards the root and return the first node whose tag
 * equals $tag. The node itself is included in the comparison.
 *
 * @param string $tag Tag name to look for (compared with ==).
 * @return object|null The matching node, or null when the parent chain ends without a match.
 */
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent)
    {
        if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $candidate->tag); }

        if ($candidate->tag == $tag)
        {
            return $candidate;
        }
    }
    return null;
}
349
-
350
/**
 * Return the node's inner HTML.
 *
 * An explicitly stored inner value wins; otherwise stored text is returned
 * through the document's restore_noise(); otherwise the outer text of every
 * entry in `nodes` is concatenated.
 *
 * @return string
 */
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $pieces = array();
    foreach ($this->nodes as $inner)
    {
        $pieces[] = $inner->outertext();
    }
    return implode('', $pieces);
}
361
-
362
/**
 * Render this node, including its own tag, as HTML.
 *
 * Order of precedence: the root node renders as its inner text; the document
 * callback (if any) is invoked; a stored outer value is returned verbatim;
 * stored text is returned through restore_noise(); otherwise the begin tag,
 * the inner content and (when an end index is recorded) the closing tag are
 * assembled.
 *
 * @return string
 */
function outertext()
{
    global $debug_object;
    if (is_object($debug_object))
    {
        $text = '';
        if ($this->tag == 'text')
        {
            if (!empty($this->text))
            {
                $text = " with text: " . $this->text;
            }
        }
        $debug_object->debug_log(1, 'Outertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback!==null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag; isset() avoids an undefined-index notice for nodes
    // that carry no begin index or whose index is not in the document.
    $ret = "";
    if ($this->dom
        && isset($this->_[HDOM_INFO_BEGIN])
        && isset($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]))
    {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
        if ($this->tag != "br")
        {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes)
        {
            foreach ($this->nodes as $n)
            {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
        $ret .= '</'.$this->tag.'>';
    return $ret;
}
421
-
422
/**
 * Return the node's plain text.
 *
 * A stored inner value is returned as-is. Text nodes return their stored text
 * through restore_noise(); comment and unknown nodes, and script/style tags,
 * yield an empty string. Otherwise the text of every entry in `nodes` is
 * converted and concatenated, and span tags get the document's
 * default_span_text appended.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];

    $kind = $this->nodetype;
    if ($kind == HDOM_TYPE_TEXT)
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($kind == HDOM_TYPE_COMMENT || $kind == HDOM_TYPE_UNKNOWN)
    {
        return '';
    }

    if (strcasecmp($this->tag, 'script')===0 || strcasecmp($this->tag, 'style')===0)
    {
        return '';
    }

    // The original author observed $this->nodes being NULL in rare cases for
    // element nodes; such nodes contribute no text at all.
    if (is_null($this->nodes)) return '';

    $plain = '';
    foreach ($this->nodes as $inner)
    {
        $plain .= $this->convert_text($inner->text());
    }

    // Separate consecutive spans so their plain text does not run together.
    if ($this->tag == "span")
    {
        $plain .= $this->dom->default_span_text;
    }
    return $plain;
}
456
-
457
/**
 * Return the inner text with CDATA wrappers removed.
 *
 * The opening marker is removed case-insensitively, the closing marker
 * case-sensitively.
 *
 * @return string
 */
function xmltext()
{
    $without_open = str_ireplace('<![CDATA[', '', $this->innertext());
    return str_replace(']]>', '', $without_open);
}
464
-
465
/**
 * Build the opening-tag text of this node (or its raw text for nodes that
 * store text).
 *
 * Attributes are emitted in `attr` order; spacing and quoting come from the
 * HDOM_INFO_SPACE / HDOM_INFO_QUOTE entries at the attribute's position.
 * Attributes whose value is null or false are skipped; a value of true is
 * written as a bare name.
 *
 * @return string
 */
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '<'.$this->tag;
    $position = 0;

    foreach ($this->attr as $name=>$value)
    {
        // The slot counts every attribute, including removed ones.
        $slot = $position++;

        // skip removed attribute
        if ($value===null || $value===false) continue;

        $html .= $this->_[HDOM_INFO_SPACE][$slot][0];

        // no value attr: nowrap, checked selected...
        if ($value===true)
        {
            $html .= $name;
            continue;
        }

        $style = $this->_[HDOM_INFO_QUOTE][$slot];
        if ($style == HDOM_QUOTE_DOUBLE)
        {
            $mark = '"';
        }
        elseif ($style == HDOM_QUOTE_SINGLE)
        {
            $mark = '\'';
        }
        else
        {
            $mark = '';
        }
        $html .= $name.$this->_[HDOM_INFO_SPACE][$slot][1].'='.$this->_[HDOM_INFO_SPACE][$slot][2].$mark.$value.$mark;
    }

    $html = $this->dom->restore_noise($html);
    return $html . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
499
-
500
/**
 * Find descendant nodes matching a CSS-like selector.
 *
 * The selector is parsed into comma-separated groups; each group is a chain
 * of steps resolved level by level with seek() (no recursion). The matched
 * node indexes of all groups are merged, sorted, and mapped to nodes.
 *
 * @param string   $selector  Selector string.
 * @param int|null $idx       Null returns all matches; otherwise the match at
 *                            that position (negative counts from the end).
 * @param bool     $lowercase Passed to seek() for case-insensitive value tests.
 * @return mixed Array of nodes, a single node, or null when $idx is out of range.
 */
function find($selector, $idx=null, $lowercase=false)
{
    $groups = $this->parse_selector($selector);
    if (count($groups) === 0) return array();

    $matched_keys = array();

    foreach ($groups as $chain)
    {
        if (count($chain) === 0) return array();
        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

        // Start every chain from this node.
        $frontier = array($this->_[HDOM_INFO_BEGIN]=>1);

        foreach ($chain as $step)
        {
            $next = array();
            foreach ($frontier as $key=>$unused)
            {
                $origin = ($key===-1) ? $this->dom->root : $this->dom->nodes[$key];
                $origin->seek($step, $next, $lowercase);
            }
            $frontier = $next;
        }

        foreach ($frontier as $key=>$unused)
        {
            $matched_keys[$key] = 1;
        }
    }

    // sort keys
    ksort($matched_keys);

    $found = array();
    foreach ($matched_keys as $key=>$unused)
    {
        $found[] = $this->dom->nodes[$key];
    }

    // return nth-element or array
    if (is_null($idx)) return $found;
    if ($idx < 0) $idx = count($found) + $idx;

    return isset($found[$idx]) ? $found[$idx] : null;
}
552
-
553
/**
 * Collect the document nodes below this node that satisfy one selector step.
 *
 * Matches are reported through $ret (passed by reference) as
 * `node index => 1`; the function itself returns nothing.
 *
 * @param array $selector  Step as array($tag, $key, $val, $exp, $no_key).
 * @param array $ret       Receives the indexes of matching nodes.
 * @param bool  $lowercase When true, attribute values are compared case-insensitively.
 * @return void
 */
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index
    if ($tag && $key && is_numeric($key))
    {
        $count = 0;
        foreach ($this->children as $c)
        {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        // Borrow the end index from the nearest ancestor that has one.
        // The null test comes first so a missing ancestor is never dereferenced.
        $parent = $this->parent;
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        if ($parent!==null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
            }
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            // A "plaintext" key compares against the node's plain text.
            if ($key == "plaintext") {
                $nodeKeyValue = $node->text();
            } else {
                // Normal search: compare against the attribute's value.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            // If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ',$node->attr[$key]) as $k) {
                    // Skip blanks produced by leading, trailing, or double spaces.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) break;
                    }
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
}
651
-
652
/**
 * Test a value against a selector pattern using the given operator.
 *
 * Supported operators: `=`, `!=`, `^=` (starts with), `$=` (ends with) and
 * `*=` (contains, case-insensitive). For `*=`, a pattern beginning with `/`
 * is used as a complete regular expression; any other pattern is quoted so
 * that characters such as `.` or `/` are matched literally.
 *
 * @param string $exp     Operator.
 * @param string $pattern Value taken from the selector.
 * @param string $value   Value taken from the node.
 * @return mixed Boolean for `=`/`!=`, the preg_match() result for the regex
 *               operators, false for an unknown operator.
 */
protected function match($exp, $pattern, $value) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    switch ($exp) {
        case '=':
            return ($value===$pattern);
        case '!=':
            return ($value!==$pattern);
        case '^=':
            return preg_match("/^".preg_quote($pattern,'/')."/", $value);
        case '$=':
            return preg_match("/".preg_quote($pattern,'/')."$/", $value);
        case '*=':
            // The emptiness test avoids reading offset 0 of an empty string.
            if ($pattern !== '' && $pattern[0]=='/') {
                return preg_match($pattern, $value);
            }
            return preg_match("/".preg_quote($pattern,'/')."/i", $value);
    }
    return false;
}
673
-
674
/**
 * Parse a selector string into groups of steps.
 *
 * Each step is array($tag, $key, $val, $exp, $no_key); steps separated by
 * spaces or slashes form one group, and a comma starts a new group.
 *
 * @param string $selector_string Selector to parse.
 * @return array List of groups, each a list of steps.
 */
protected function parse_selector($selector_string) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    // Pattern of CSS selectors, modified from mootools. The colon is allowed in
    // tag and attribute names (e.g. <tag attr:ibute="x">); such attributes must
    // be read with getAttribute() because $node->x:y is not valid PHP.
    // An attribute selector may start with an @ sign that is not captured.
    // The hyphen is placed last in each character class: written as [\w-:...]
    // it is an invalid range under PCRE2 (PHP >= 7.3) and compilation fails.
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}

    $selectors = array();
    $result = array();

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase; the cast keeps the previous result ('' for a
        // missing key) without passing null to strtolower().
        if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower((string) $key);}
        // elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

        $result[] = array($tag, $key, $val, $exp, $no_key);
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = array();
        }
    }
    if (count($result)>0)
        $selectors[] = $result;
    return $selectors;
}
721
-
722
/**
 * Magic read access.
 *
 * A set attribute of that name wins and is returned through convert_text().
 * Otherwise the names outertext, innertext, plaintext and xmltext map to the
 * corresponding methods; any other name yields whether the attribute key exists.
 *
 * @param string $name Attribute or pseudo-property name.
 * @return mixed
 */
function __get($name)
{
    if (isset($this->attr[$name]))
    {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext') return $this->outertext();
    if ($name == 'innertext') return $this->innertext();
    if ($name == 'plaintext') return $this->text();
    if ($name == 'xmltext') return $this->xmltext();

    return array_key_exists($name, $this->attr);
}
737
-
738
/**
 * Magic write access.
 *
 * `outertext` and `innertext` store a replacement rendering (for nodes that
 * hold text, `innertext` replaces the text itself). Any other name sets an
 * attribute; default spacing and double quoting are registered only when the
 * attribute key is genuinely new, so re-setting an attribute that was removed
 * (value null) does not append a second set of formatting entries.
 *
 * @param string $name  Attribute or pseudo-property name.
 * @param mixed  $value New value.
 * @return mixed The assigned value for outertext/innertext, otherwise nothing.
 */
function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    switch ($name)
    {
        case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
        case 'innertext':
            if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
            return $this->_[HDOM_INFO_INNER] = $value;
    }

    // array_key_exists() also sees keys whose value is null; isset() does not.
    if (!is_array($this->attr) || !array_key_exists($name, $this->attr))
    {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
757
-
758
/**
 * Magic isset().
 *
 * The pseudo-properties outertext, innertext and plaintext always exist; any
 * other name exists when the attribute key is present, even with a null value
 * (covers no-value attributes such as nowrap, checked, selected).
 *
 * @param string $name Attribute or pseudo-property name.
 * @return bool
 */
function __isset($name)
{
    if ($name == 'outertext' || $name == 'innertext' || $name == 'plaintext')
    {
        return true;
    }

    // A key that does not exist can never be "set", so the key test alone decides.
    return array_key_exists($name, $this->attr);
}
769
-
770
/**
 * Magic unset(): drop an attribute that is currently set.
 *
 * Attributes that are absent, or present with a null value, are left untouched.
 *
 * @param string $name Attribute name.
 * @return void
 */
function __unset($name) {
    if (!isset($this->attr[$name]))
    {
        return;
    }
    unset($this->attr[$name]);
}
774
-
775
/**
 * Convert text from the document's detected charset to its target charset
 * when the two differ.
 *
 * If the target is UTF-8 and the text already is valid UTF-8, it is returned
 * unconverted. If iconv() fails, the original text is kept instead of being
 * replaced by iconv()'s false result. For a UTF-8 target, a BOM at the start
 * or end of the result is stripped.
 *
 * @param string $text Text to convert.
 * @return string Converted (or original) text.
 */
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    $converted_text = $text;

    $sourceCharset = "";
    $targetCharset = "";

    if ($this->dom)
    {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

    if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
    {
        // The reported encoding may be wrong and the text already UTF-8.
        $already_utf8 = (strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text));
        if (!$already_utf8)
        {
            $result = iconv($sourceCharset, $targetCharset, $text);
            // iconv() returns false on failure; keep the unconverted text then.
            if ($result !== false)
            {
                $converted_text = $result;
            }
        }
    }

    // Make sure no BOM survives in UTF-8 output.
    if ($targetCharset == 'UTF-8')
    {
        if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
821
-
822
/**
 * Returns true if $str is structurally valid UTF-8 and false otherwise.
 *
 * Every byte >= 0x80 must be a lead byte (0xC0-0xFD) followed by the right
 * number of continuation bytes (0x80-0xBF). A stray continuation byte,
 * including 0x80 itself, makes the string invalid.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++)
    {
        $c = ord($str[$i]);
        // 0x80 is a continuation byte, not ASCII, so the test is >= 128.
        if ($c >= 128)
        {
            if ($c >= 254) return false;
            elseif ($c >= 252) $bits = 6;
            elseif ($c >= 248) $bits = 5;
            elseif ($c >= 240) $bits = 4;
            elseif ($c >= 224) $bits = 3;
            elseif ($c >= 192) $bits = 2;
            else return false;

            // The whole sequence must fit inside the string.
            if (($i + $bits) > $len) return false;

            while ($bits > 1)
            {
                $i++;
                $b = ord($str[$i]);
                if ($b < 128 || $b > 191) return false;
                $bits--;
            }
        }
    }
    return true;
}
857
- /*
858
- function is_utf8($string)
859
- {
860
- //this is buggy
861
- return (utf8_encode(utf8_decode($string)) == $string);
862
- }
863
- */
864
-
865
/**
 * Try to determine the displayed size of an img on the page.
 * NOTE: This only works on an IMG tag. Returns FALSE on all other tag types.
 *
 * The width/height attributes are used when present. Otherwise an inline
 * style declaration in whole pixels (e.g. "width: 100px") is used; the value
 * is trimmed first, and "0px" is accepted.
 *
 * @author John Schlick
 * @return array|false array('height' => ..., 'width' => ...) with -1 for a
 *                     dimension that could not be determined, or false for
 *                     non-img nodes.
 */
function get_display_size()
{
    if ($this->tag !== 'img')
    {
        return false;
    }

    // Start from the tag's own width/height attributes, if any.
    $size = array(
        'height' => isset($this->attr['height']) ? $this->attr['height'] : -1,
        'width' => isset($this->attr['width']) ? $this->attr['width'] : -1,
    );

    // Now look for an inline style.
    if (isset($this->attr['style']))
    {
        $declarations = array();
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            // [^;]+ also captures whitespace before the semicolon; drop it.
            $declarations[$match[1]] = trim($match[2]);
        }

        foreach (array('width', 'height') as $dimension)
        {
            if (!isset($declarations[$dimension]) || $size[$dimension] != -1)
            {
                continue;
            }
            // Only values given in px (pixels) are considered.
            if (strtolower(substr($declarations[$dimension], -2)) != 'px')
            {
                continue;
            }
            $proposed = trim(substr($declarations[$dimension], 0, -2));
            // filter_var() returns false on failure; 0 is a valid integer.
            if (filter_var($proposed, FILTER_VALIDATE_INT) !== false)
            {
                $size[$dimension] = $proposed;
            }
        }
    }

    // Possible future enhancement: consult class/id based CSS rules, on the
    // page or in separate stylesheets, for a height or width.

    return $size;
}
952
-
953
// camelCase aliases that delegate to the snake_case / magic implementations

/** @return array The raw attribute list. */
function getAllAttributes()
{
    return $this->attr;
}

/** Alias of __get(). */
function getAttribute($name)
{
    return $this->__get($name);
}

/** Alias of __set(). */
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

/** Alias of __isset(). */
function hasAttribute($name)
{
    return $this->__isset($name);
}

/** Marks an attribute as removed by setting it to null. */
function removeAttribute($name)
{
    $this->__set($name, null);
}

/** First match of find("#$id"). */
function getElementById($id)
{
    return $this->find("#$id", 0);
}

/** find("#$id") with an optional index. */
function getElementsById($id, $idx=null)
{
    return $this->find("#$id", $idx);
}

/** First match of find($name). */
function getElementByTagName($name)
{
    return $this->find($name, 0);
}

/** find($name) with an optional index. */
function getElementsByTagName($name, $idx=null)
{
    return $this->find($name, $idx);
}

/** Alias of parent() without arguments. */
function parentNode()
{
    return $this->parent();
}

/** Alias of children(). */
function childNodes($idx=-1)
{
    return $this->children($idx);
}

/** Alias of first_child(). */
function firstChild()
{
    return $this->first_child();
}

/** Alias of last_child(). */
function lastChild()
{
    return $this->last_child();
}

/** Alias of next_sibling(). */
function nextSibling()
{
    return $this->next_sibling();
}

/** Alias of prev_sibling(). */
function previousSibling()
{
    return $this->prev_sibling();
}

/** Alias of has_child(). */
function hasChildNodes()
{
    return $this->has_child();
}

/** @return string The node's tag. */
function nodeName()
{
    return $this->tag;
}

/** Re-parents $node under this node via parent() and returns $node. */
function appendChild($node)
{
    $node->parent($this);
    return $node;
}
972
-
973
- }
974
-
975
- /**
976
- * simple html dom parser
977
- * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
978
- * Paperg - change $size from protected to public so we can easily access it
979
- * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
980
- *
981
- * @package PlaceLocalInclude
982
- */
983
- class simple_html_dom
984
- {
985
// Root node of the document; created in prepare().
public $root = null;
// Nodes belonging to this document.
public $nodes = array();
// Optional callable invoked when a node renders its outer text.
public $callback = null;
// Whether tag and attribute names in selectors are lowercased.
public $lowercase = false;
// Used to keep track of how large the text was when we started.
public $original_size;
public $size;

// Parser state.
protected $pos;
protected $doc;
protected $char;
protected $cursor;
protected $parent;
protected $noise = array();

// Character sets used by the tokenizer.
protected $token_blank = " \t\r\n";
protected $token_equal = ' =/>';
protected $token_slash = " />\r\n\t";
protected $token_attr = ' >';

// Note that these are referenced by child nodes, so they need to be public for those nodes to see them.
public $_charset = '';
public $_target_charset = '';
protected $default_br_text = "";
public $default_span_text = "";

// use isset instead of in_array, performance boost about 30%...
protected $self_closing_tags = array(
    'img'=>1,
    'br'=>1,
    'input'=>1,
    'meta'=>1,
    'link'=>1,
    'hr'=>1,
    'base'=>1,
    'embed'=>1,
    'spacer'=>1,
);
protected $block_tags = array(
    'root'=>1,
    'body'=>1,
    'form'=>1,
    'div'=>1,
    'span'=>1,
    'table'=>1,
);
// Known sourceforge issue #2977341
// B tags that are not closed cause us to return everything to the end of the document.
protected $optional_closing_tags = array(
    'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
    'th'=>array('th'=>1),
    'td'=>array('td'=>1),
    'li'=>array('li'=>1),
    'dt'=>array('dt'=>1, 'dd'=>1),
    'dd'=>array('dd'=>1, 'dt'=>1),
    'dl'=>array('dd'=>1, 'dt'=>1),
    'p'=>array('p'=>1),
    'nobr'=>array('nobr'=>1),
    'b'=>array('b'=>1),
    'option'=>array('option'=>1),
);
1026
-
1027
/**
 * Create a document and optionally load content into it.
 *
 * @param string|null $str             HTML markup, or an http:// URL / existing
 *                                     file path to load from. Nothing is loaded
 *                                     when empty.
 * @param bool        $lowercase       Passed to load() for markup strings.
 * @param bool        $forceTagsClosed When false, the optional-closing-tag table
 *                                     is emptied, i.e. the markup is trusted.
 * @param bool        $stripRN         Passed to load() for markup strings.
 */
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $stripRN=true)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can
    // lead to parsing errors if we SHOULD trust the html. The table has to be
    // emptied before loading, because parsing is what consults it.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str)
    {
        if (preg_match("/^http:\/\//i",$str) || is_file($str))
        {
            $this->load_file($str);
        }
        else
        {
            // load() accepts exactly these three arguments.
            $this->load($str, $lowercase, $stripRN);
        }
    }

    $this->_target_charset = 'UTF-8';
}
1046
-
1047
/**
 * Release node references through clear() when the document is destroyed.
 */
function __destruct()
{
    $this->clear();
}
1051
-
1052
/**
 * Load HTML from a string and parse it.
 *
 * Noise (CDATA, comments, script/style/code blocks, server-side and smarty
 * snippets) is removed via remove_noise() before parsing; the order of the
 * patterns matters (script removal precedes style removal, per sourceforge
 * tracker item 2949097).
 *
 * @param string $str       HTML markup.
 * @param bool   $lowercase Passed to prepare().
 * @param bool   $stripRN   Passed to prepare().
 * @return $this The document itself, so calls can be chained.
 */
function load($str, $lowercase=true, $stripRN=true)
{
    // prepare
    $this->prepare($str, $lowercase, $stripRN, '', '');

    // Argument lists for remove_noise(), applied in this exact order.
    $noise_rules = array(
        // cdata
        array("'<!\[CDATA\[(.*?)\]\]>'is", true),
        // comments
        array("'<!--(.*?)-->'is"),
        // <script> tags
        array("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"),
        array("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"),
        // <style> tags
        array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"),
        array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"),
        // preformatted tags
        array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"),
        // server side scripts
        array("'(<\?)(.*?)(\?>)'s", true),
        // smarty scripts
        array("'(\{\w)(.*?)(\})'s", true),
    );
    foreach ($noise_rules as $rule_args)
    {
        call_user_func_array(array($this, 'remove_noise'), $rule_args);
    }

    // parsing
    while ($this->parse());
    // end
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    // make load function chainable
    return $this;
}
1088
-
1089
/**
 * Load HTML from a file or URL.
 *
 * All arguments are forwarded to file_get_contents(). Failure is detected from
 * that call's own return value rather than from error_get_last(), which would
 * also report unrelated errors raised earlier in the request.
 *
 * @return false|null False (with the document cleared) when the content could
 *                    not be read; nothing on success.
 */
function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    if ($contents === false)
    {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
1100
-
1101
/**
 * Register the callback stored in $this->callback.
 *
 * @param callable $function_name Callback to store.
 * @return void
 */
function set_callback($function_name)
{
    $this->callback = $function_name;
}
1106
-
1107
/**
 * Unregister the callback by resetting $this->callback to null.
 *
 * @return void
 */
function remove_callback()
{
    $this->callback = null;
}
1112
-
1113
/**
 * Serialize the document, optionally writing it to a file.
 *
 * @param string $filepath Target path; nothing is written when it is ''.
 * @return string The root node's inner text.
 */
function save($filepath='')
{
    $markup = $this->root->innertext();

    if ($filepath !== '')
    {
        file_put_contents($filepath, $markup, LOCK_EX);
    }

    return $markup;
}
1120
-
1121
/**
 * Find nodes by CSS selector, starting from the root node.
 *
 * @param string   $selector  Selector string.
 * @param int|null $idx       Forwarded to the root node's find().
 * @param bool     $lowercase Forwarded; requests case-insensitive value tests.
 * @return mixed Whatever the root node's find() returns.
 */
function find($selector, $idx=null, $lowercase=false)
{
    $root = $this->root;
    return $root->find($selector, $idx, $lowercase);
}
1127
-
1128
/**
 * Clean up memory (works around the php5 circular-reference leak).
 *
 * Clears every node, then the children (if that property is set), the parent
 * and the root, and finally drops the document text and the noise table.
 *
 * @return void
 */
function clear()
{
    foreach ($this->nodes as $node)
    {
        $node->clear();
    }

    // Documented in the sourceforge repository (2977248) as a fix for ongoing
    // memory leaks that occur even with the use of clear.
    if (isset($this->children))
    {
        foreach ($this->children as $child)
        {
            $child->clear();
        }
    }

    if (isset($this->parent))
    {
        $this->parent->clear();
        unset($this->parent);
    }
    if (isset($this->root))
    {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc);
    unset($this->noise);
}
1140
/**
 * Dump the document by delegating to the root node's dump().
 *
 * @param bool $show_attr Forwarded to the root node's dump().
 * @return void
 */
function dump($show_attr=true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
1144
-
1145
/**
 * Reset the parser state and create a fresh root node for $str.
 *
 * @param string $str       HTML markup to be parsed.
 * @param bool   $lowercase Stored in $this->lowercase.
 * @param bool   $stripRN   Currently without effect: the value is overwritten
 *                          with false and no \r / \n stripping is performed.
 * @return void
 */
protected function prepare($str, $lowercase=true, $stripRN=true)
{
    $this->clear();

    // Record the content length before anything is done to it; the original
    // size is kept separately in case it is useful to someone.
    $length = strlen($str);
    $this->size = $length;
    $this->original_size = $length;

    $stripRN = false;

    // Parser state.
    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();
    $this->lowercase = $lowercase;
    $this->default_br_text = "\r\n";
    $this->default_span_text = " ";

    // Root node; it also serves as the initial parent.
    $root = new simple_html_dom_node($this);
    $this->root = $root;
    $this->root->tag = 'root';
    $this->root->_[HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $this->root;
}
1173
-
1174
/**
 * Parse the next piece of the document.
 *
 * Text up to the next '<' becomes a text node; when there is no such text,
 * the work is handed to read_tag().
 *
 * @return bool True after adding a text node, otherwise read_tag()'s result.
 */
protected function parse()
{
    $text = $this->copy_until_char('<');
    if ($text === '')
    {
        return $this->read_tag();
    }

    // text
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;
    $text_node->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($text_node, false);

    return true;
}
1189
-
1190
/**
 * Determine the character set of the parsed page and store it in $this->_charset.
 *
 * Sources, in order of precedence:
 *  1. the content type returned by get_last_retrieve_url_contents_content_type(),
 *     if a function of that name exists;
 *  2. a <meta http-equiv=Content-Type> tag (ISO-8859-1 when it names no charset);
 *  3. mb_detect_encoding() over the plain text, when available;
 *  4. UTF-8 as the last resort.
 * ISO-8859-1 / Latin1 / Latin-1 are replaced by CP1252.
 *
 * @return string The detected charset.
 */
protected function parse_charset()
{
    global $debug_object;

    $charset = null;

    // 1. Content-type supplied by an optional user-provided routine.
    if (function_exists('get_last_retrieve_url_contents_content_type'))
    {
        $header = get_last_retrieve_url_contents_content_type();
        if (preg_match('/charset=(.+)/', $header, $found))
        {
            $charset = $found[1];
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);}
        }
    }

    // 2. Meta tag in the document itself.
    if (empty($charset))
    {
        $meta = $this->root->find('meta[http-equiv=Content-Type]',0, true);
        if (!empty($meta))
        {
            $content = $meta->content;
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $content);}

            if (!empty($content))
            {
                if (preg_match('/charset=(.+)/i', $content, $found))
                {
                    $charset = $found[1];
                }
                else
                {
                    // A meta tag without a charset: the original author's research says it's typically ISO-8859-1
                    if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // 3./4. Detect from the text, falling back to UTF-8.
    if (empty($charset))
    {
        // Stays false when mbstring is not available on this machine.
        $charset = false;
        if (function_exists('mb_detect_encoding'))
        {
            $charset = mb_detect_encoding($this->root->plaintext . "ascii", array( "UTF-8", "CP1252" ) );
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);}
        }

        if ($charset === false)
        {
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');}
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want it instead.
    if (in_array(strtolower($charset), array('iso-8859-1', 'latin1', 'latin-1'), true))
    {
        if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);}

    return $this->_charset = $charset;
}
1267
-
1268
- // read tag info
1269
- protected function read_tag()
1270
- {
1271
- if ($this->char!=='<')
1272
- {
1273
- $this->root->_[HDOM_INFO_END] = $this->cursor;
1274
- return false;
1275
- }
1276
- $begin_tag_pos = $this->pos;
1277
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1278
-
1279
- // end tag
1280
- if ($this->char==='/')
1281
- {
1282
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1283
- // This represents the change in the simple_html_dom trunk from revision 180 to 181.
1284
- // $this->skip($this->token_blank_t);
1285
- $this->skip($this->token_blank);
1286
- $tag = $this->copy_until_char('>');
1287
-
1288
- // skip attributes in end tag
1289
- if (($pos = strpos($tag, ' '))!==false)
1290
- $tag = substr($tag, 0, $pos);
1291
-
1292
- $parent_lower = strtolower($this->parent->tag);
1293
- $tag_lower = strtolower($tag);
1294
-
1295
- if ($parent_lower!==$tag_lower)
1296
- {
1297
- if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower]))
1298
- {
1299
- $this->parent->_[HDOM_INFO_END] = 0;
1300
- $org_parent = $this->parent;
1301
-
1302
- while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
1303
- $this->parent = $this->parent->parent;
1304
-
1305
- if (strtolower($this->parent->tag)!==$tag_lower) {
1306
- $this->parent = $org_parent; // restore origonal parent
1307
- if ($this->parent->parent) $this->parent = $this->parent->parent;
1308
- $this->parent->_[HDOM_INFO_END] = $this->cursor;
1309
- return $this->as_text_node($tag);
1310
- }
1311
- }
1312
- else if (($this->parent->parent) && isset($this->block_tags[$tag_lower]))
1313
- {
1314
- $this->parent->_[HDOM_INFO_END] = 0;
1315
- $org_parent = $this->parent;
1316
-
1317
- while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
1318
- $this->parent = $this->parent->parent;
1319
-
1320
- if (strtolower($this->parent->tag)!==$tag_lower)
1321
- {
1322
- $this->parent = $org_parent; // restore origonal parent
1323
- $this->parent->_[HDOM_INFO_END] = $this->cursor;
1324
- return $this->as_text_node($tag);
1325
- }
1326
- }
1327
- else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower)
1328
- {
1329
- $this->parent->_[HDOM_INFO_END] = 0;
1330
- $this->parent = $this->parent->parent;
1331
- }
1332
- else
1333
- return $this->as_text_node($tag);
1334
- }
1335
-
1336
- $this->parent->_[HDOM_INFO_END] = $this->cursor;
1337
- if ($this->parent->parent) $this->parent = $this->parent->parent;
1338
-
1339
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1340
- return true;
1341
- }
1342
-
1343
- $node = new simple_html_dom_node($this);
1344
- $node->_[HDOM_INFO_BEGIN] = $this->cursor;
1345
- ++$this->cursor;
1346
- $tag = $this->copy_until($this->token_slash);
1347
- $node->tag_start = $begin_tag_pos;
1348
-
1349
- // doctype, cdata & comments...
1350
- if (isset($tag[0]) && $tag[0]==='!') {
1351
- $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
1352
-
1353
- if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
1354
- $node->nodetype = HDOM_TYPE_COMMENT;
1355
- $node->tag = 'comment';
1356
- } else {
1357
- $node->nodetype = HDOM_TYPE_UNKNOWN;
1358
- $node->tag = 'unknown';
1359
- }
1360
- if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
1361
- $this->link_nodes($node, true);
1362
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1363
- return true;
1364
- }
1365
-
1366
- // text
1367
- if ($pos=strpos($tag, '<')!==false) {
1368
- $tag = '<' . substr($tag, 0, -1);
1369
- $node->_[HDOM_INFO_TEXT] = $tag;
1370
- $this->link_nodes($node, false);
1371
- $this->char = $this->doc[--$this->pos]; // prev
1372
- return true;
1373
- }
1374
-
1375
- if (!preg_match("/^[\w-:]+$/", $tag)) {
1376
- $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
1377
- if ($this->char==='<') {
1378
- $this->link_nodes($node, false);
1379
- return true;
1380
- }
1381
-
1382
- if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
1383
- $this->link_nodes($node, false);
1384
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1385
- return true;
1386
- }
1387
-
1388
- // begin tag
1389
- $node->nodetype = HDOM_TYPE_ELEMENT;
1390
- $tag_lower = strtolower($tag);
1391
- $node->tag = ($this->lowercase) ? $tag_lower : $tag;
1392
-
1393
- // handle optional closing tags
1394
- if (isset($this->optional_closing_tags[$tag_lower]) )
1395
- {
1396
- while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)]))
1397
- {
1398
- $this->parent->_[HDOM_INFO_END] = 0;
1399
- $this->parent = $this->parent->parent;
1400
- }
1401
- $node->parent = $this->parent;
1402
- }
1403
-
1404
- $guard = 0; // prevent infinity loop
1405
- $space = array($this->copy_skip($this->token_blank), '', '');
1406
-
1407
- // attributes
1408
- do
1409
- {
1410
- if ($this->char!==null && $space[0]==='')
1411
- {
1412
- break;
1413
- }
1414
- $name = $this->copy_until($this->token_equal);
1415
- if ($guard===$this->pos)
1416
- {
1417
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1418
- continue;
1419
- }
1420
- $guard = $this->pos;
1421
-
1422
- // handle endless '<'
1423
- if ($this->pos>=$this->size-1 && $this->char!=='>') {
1424
- $node->nodetype = HDOM_TYPE_TEXT;
1425
- $node->_[HDOM_INFO_END] = 0;
1426
- $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
1427
- $node->tag = 'text';
1428
- $this->link_nodes($node, false);
1429
- return true;
1430
- }
1431
-
1432
- // handle mismatch '<'
1433
- if ($this->doc[$this->pos-1]=='<') {
1434
- $node->nodetype = HDOM_TYPE_TEXT;
1435
- $node->tag = 'text';
1436
- $node->attr = array();
1437
- $node->_[HDOM_INFO_END] = 0;
1438
- $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
1439
- $this->pos -= 2;
1440
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1441
- $this->link_nodes($node, false);
1442
- return true;
1443
- }
1444
-
1445
- if ($name!=='/' && $name!=='') {
1446
- $space[1] = $this->copy_skip($this->token_blank);
1447
- $name = $this->restore_noise($name);
1448
- if ($this->lowercase) $name = strtolower($name);
1449
- if ($this->char==='=') {
1450
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1451
- $this->parse_attr($node, $name, $space);
1452
- }
1453
- else {
1454
- //no value attr: nowrap, checked selected...
1455
- $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
1456
- $node->attr[$name] = true;
1457
- if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
1458
- }
1459
- $node->_[HDOM_INFO_SPACE][] = $space;
1460
- $space = array($this->copy_skip($this->token_blank), '', '');
1461
- }
1462
- else
1463
- break;
1464
- } while ($this->char!=='>' && $this->char!=='/');
1465
-
1466
- $this->link_nodes($node, true);
1467
- $node->_[HDOM_INFO_ENDSPACE] = $space[0];
1468
-
1469
- // check self closing
1470
- if ($this->copy_until_char_escape('>')==='/')
1471
- {
1472
- $node->_[HDOM_INFO_ENDSPACE] .= '/';
1473
- $node->_[HDOM_INFO_END] = 0;
1474
- }
1475
- else
1476
- {
1477
- // reset parent
1478
- if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
1479
- }
1480
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1481
-
1482
- // If it's a BR tag, we need to set it's text to the default text.
1483
- // This way when we see it in plaintext, we can generate formatting that the user wants.
1484
- // since a br tag never has sub nodes, this works well.
1485
- if ($node->tag == "br")
1486
- {
1487
- $node->_[HDOM_INFO_INNER] = $this->default_br_text;
1488
- }
1489
-
1490
- return true;
1491