Gallery Custom Links - Version 1.1.4

Version Description

  • Fix: Attempt to fix the way autoload is working.
  • Info: If you like the plugin, your reviews are welcome here :) Thank you!
Download this release

Release Info

Developer TigrouMeow
Plugin Icon 128x128 Gallery Custom Links
Version 1.1.4
Comparing to
See all releases

Code changes from version 1.1.3 to 1.1.4

composer.lock CHANGED
@@ -57,16 +57,16 @@
57
  },
58
  {
59
  "name": "kub-at/php-simple-html-dom-parser",
60
- "version": "1.7.1",
61
  "source": {
62
  "type": "git",
63
  "url": "https://github.com/Kub-AT/php-simple-html-dom-parser.git",
64
- "reference": "7a745b20157efb0f1be3021394769bd6b8e9ed4e"
65
  },
66
  "dist": {
67
  "type": "zip",
68
- "url": "https://api.github.com/repos/Kub-AT/php-simple-html-dom-parser/zipball/7a745b20157efb0f1be3021394769bd6b8e9ed4e",
69
- "reference": "7a745b20157efb0f1be3021394769bd6b8e9ed4e",
70
  "shasum": ""
71
  },
72
  "require": {
@@ -99,7 +99,7 @@
99
  "dom",
100
  "html"
101
  ],
102
- "time": "2019-01-02T14:33:28+00:00"
103
  }
104
  ],
105
  "packages-dev": [],
57
  },
58
  {
59
  "name": "kub-at/php-simple-html-dom-parser",
60
+ "version": "1.8.1",
61
  "source": {
62
  "type": "git",
63
  "url": "https://github.com/Kub-AT/php-simple-html-dom-parser.git",
64
+ "reference": "6db1e01db320040024cd1f74b0e1483aa2670720"
65
  },
66
  "dist": {
67
  "type": "zip",
68
+ "url": "https://api.github.com/repos/Kub-AT/php-simple-html-dom-parser/zipball/6db1e01db320040024cd1f74b0e1483aa2670720",
69
+ "reference": "6db1e01db320040024cd1f74b0e1483aa2670720",
70
  "shasum": ""
71
  },
72
  "require": {
99
  "dom",
100
  "html"
101
  ],
102
+ "time": "2019-03-05T14:12:22+00:00"
103
  }
104
  ],
105
  "packages-dev": [],
gallery_custom_links.php CHANGED
@@ -3,7 +3,7 @@
3
  Plugin Name: Gallery Custom Links
4
  Plugin URI: https://meowapps.com
5
  Description: Gallery Custom Links allows you to link images from galleries to a specified URL. Tested with WordPress Gallery, Gutenberg, the Meow Gallery and others.
6
- Version: 1.1.3
7
  Author: Jordy Meow
8
  Author URI: https://meowapps.com
9
  Text Domain: gallery-custom-links
@@ -23,7 +23,7 @@ if ( class_exists( 'Meow_Gallery_Custom_Links' ) ) {
23
  }
24
 
25
  global $mgcl_version;
26
- $mgcl_version = '1.1.3';
27
 
28
  include "mgcl_admin.php";
29
  $mgcl_admin = new Meow_Gallery_Custom_Links_Admin( 'mgcl', __FILE__, 'gallery-custom-links' );
3
  Plugin Name: Gallery Custom Links
4
  Plugin URI: https://meowapps.com
5
  Description: Gallery Custom Links allows you to link images from galleries to a specified URL. Tested with WordPress Gallery, Gutenberg, the Meow Gallery and others.
6
+ Version: 1.1.4
7
  Author: Jordy Meow
8
  Author URI: https://meowapps.com
9
  Text Domain: gallery-custom-links
23
  }
24
 
25
  global $mgcl_version;
26
+ $mgcl_version = '1.1.4';
27
 
28
  include "mgcl_admin.php";
29
  $mgcl_admin = new Meow_Gallery_Custom_Links_Admin( 'mgcl', __FILE__, 'gallery-custom-links' );
mgcl_core.php CHANGED
@@ -1,10 +1,6 @@
1
  <?php
2
 
3
- require_once 'vendor/autoload.php';
4
-
5
- use DiDom\Document;
6
- use DiDom\Element;
7
- use KubAT\PhpSimple\HtmlDomParser;
8
 
9
  class Meow_Gallery_Custom_Links
10
  {
@@ -85,17 +81,21 @@ class Meow_Gallery_Custom_Links
85
  }
86
 
87
  function linkify_element( $element ) {
 
 
88
 
89
- $classes = $this->parsingEngine === 'HtmlDomParser' ? $element->class : $element->attr('class');
 
90
 
91
- $mediaId = null;
 
 
 
 
 
92
 
93
- // Check if the wp-image-xxx class exists
94
- $url = null;
95
- if ( preg_match( '/wp-image-([0-9]{1,10})/i', $classes, $matches ) )
96
- $mediaId = $matches[1];
97
- // Otherwise, resolve the ID from the URL
98
- else {
99
  $url = $this->parsingEngine === 'HtmlDomParser' ? $element->src : $element->attr('src');
100
  $mediaId = $this->resolve_image_id( $url );
101
  }
@@ -164,7 +164,7 @@ class Meow_Gallery_Custom_Links
164
  else {
165
  if ( $parent->tag === 'figure' )
166
  $parent = $parent->parent();
167
- $a = new Element('a');
168
  $a->attr( 'href', $url );
169
  $a->attr( 'class', 'custom-link no-lightbox' );
170
  $a->attr( 'onclick', 'event.stopPropagation()' );
@@ -188,11 +188,11 @@ class Meow_Gallery_Custom_Links
188
  return $buffer;
189
 
190
  if ( $this->parsingEngine === 'HtmlDomParser' ) {
191
- $html = new HtmlDomParser();
192
  $html = $html->str_get_html( $buffer, true, true, DEFAULT_TARGET_CHARSET, false );
193
  }
194
  else {
195
- $html = new Document();
196
  $html->preserveWhiteSpace();
197
  if ( defined( 'LIBXML_HTML_NOIMPLIED' ) && defined( 'LIBXML_HTML_NODEFDTD' ) )
198
  $html->loadHtml( $buffer, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );
1
  <?php
2
 
3
+ require __DIR__ . '/vendor/autoload.php';
 
 
 
 
4
 
5
  class Meow_Gallery_Custom_Links
6
  {
81
  }
82
 
83
  function linkify_element( $element ) {
84
+ $mediaId = null;
85
+ $url = null;
86
 
87
+ // 1. If there is an Attachment ID
88
+ $mediaId = $this->parsingEngine === 'HtmlDomParser' ? $element->{'data-attachment-id'} : $element->attr('data-attachment-id');
89
 
90
+ // 2. Check if the wp-image-xxx class exists
91
+ if ( empty( $mediaId ) ) {
92
+ $classes = $this->parsingEngine === 'HtmlDomParser' ? $element->class : $element->attr('class');
93
+ if ( preg_match( '/wp-image-([0-9]{1,10})/i', $classes, $matches ) )
94
+ $mediaId = $matches[1];
95
+ }
96
 
97
+ // 3. Otherwise, resolve the ID from the URL
98
+ if ( empty( $mediaId ) ) {
 
 
 
 
99
  $url = $this->parsingEngine === 'HtmlDomParser' ? $element->src : $element->attr('src');
100
  $mediaId = $this->resolve_image_id( $url );
101
  }
164
  else {
165
  if ( $parent->tag === 'figure' )
166
  $parent = $parent->parent();
167
+ $a = new DiDom\Element('a');
168
  $a->attr( 'href', $url );
169
  $a->attr( 'class', 'custom-link no-lightbox' );
170
  $a->attr( 'onclick', 'event.stopPropagation()' );
188
  return $buffer;
189
 
190
  if ( $this->parsingEngine === 'HtmlDomParser' ) {
191
+ $html = new KubAT\PhpSimple\HtmlDomParser();
192
  $html = $html->str_get_html( $buffer, true, true, DEFAULT_TARGET_CHARSET, false );
193
  }
194
  else {
195
+ $html = new DiDom\Document();
196
  $html->preserveWhiteSpace();
197
  if ( defined( 'LIBXML_HTML_NOIMPLIED' ) && defined( 'LIBXML_HTML_NODEFDTD' ) )
198
  $html->loadHtml( $buffer, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );
readme.txt CHANGED
@@ -4,7 +4,7 @@ Tags: custom, links, gallery, gutenberg
4
  Requires at least: 5.0
5
  Tested up to: 5.1
6
  Requires PHP: 7.0
7
- Stable tag: 1.1.3
8
 
9
  Gallery Custom Links allows you to link images from galleries to a specified URL. Tested with WordPress Gallery, Gutenberg, the Meow Gallery and others.
10
 
@@ -41,9 +41,12 @@ Replace all the files. Nothing else to do.
41
 
42
  == Changelog ==
43
 
 
 
 
 
44
  = 1.1.3 =
45
  * Fix: Avoid analyzing the html content if the parser returned a boolean or an empty string.
46
- * Info: If you like the plugin, your reviews are welcome [here](https://wordpress.org/support/plugin/gallery-custom-links/reviews/?rate=5#new-post) :) Thank you!
47
 
48
  = 1.1.2 =
49
  * Add: Rel can now be set to nofollow.
4
  Requires at least: 5.0
5
  Tested up to: 5.1
6
  Requires PHP: 7.0
7
+ Stable tag: 1.1.4
8
 
9
  Gallery Custom Links allows you to link images from galleries to a specified URL. Tested with WordPress Gallery, Gutenberg, the Meow Gallery and others.
10
 
41
 
42
  == Changelog ==
43
 
44
+ = 1.1.4 =
45
+ * Fix: Attempt to fix the way autoload is working.
46
+ * Info: If you like the plugin, your reviews are welcome [here](https://wordpress.org/support/plugin/gallery-custom-links/reviews/?rate=5#new-post) :) Thank you!
47
+
48
  = 1.1.3 =
49
  * Fix: Avoid analyzing the html content if the parser returned a boolean or an empty string.
 
50
 
51
  = 1.1.2 =
52
  * Add: Rel can now be set to nofollow.
vendor/composer/installed.json CHANGED
@@ -52,23 +52,23 @@
52
  },
53
  {
54
  "name": "kub-at/php-simple-html-dom-parser",
55
- "version": "1.7.1",
56
- "version_normalized": "1.7.1.0",
57
  "source": {
58
  "type": "git",
59
  "url": "https://github.com/Kub-AT/php-simple-html-dom-parser.git",
60
- "reference": "7a745b20157efb0f1be3021394769bd6b8e9ed4e"
61
  },
62
  "dist": {
63
  "type": "zip",
64
- "url": "https://api.github.com/repos/Kub-AT/php-simple-html-dom-parser/zipball/7a745b20157efb0f1be3021394769bd6b8e9ed4e",
65
- "reference": "7a745b20157efb0f1be3021394769bd6b8e9ed4e",
66
  "shasum": ""
67
  },
68
  "require": {
69
  "php": ">=5.3.2"
70
  },
71
- "time": "2019-01-02T14:33:28+00:00",
72
  "type": "library",
73
  "installation-source": "dist",
74
  "autoload": {
52
  },
53
  {
54
  "name": "kub-at/php-simple-html-dom-parser",
55
+ "version": "1.8.1",
56
+ "version_normalized": "1.8.1.0",
57
  "source": {
58
  "type": "git",
59
  "url": "https://github.com/Kub-AT/php-simple-html-dom-parser.git",
60
+ "reference": "6db1e01db320040024cd1f74b0e1483aa2670720"
61
  },
62
  "dist": {
63
  "type": "zip",
64
+ "url": "https://api.github.com/repos/Kub-AT/php-simple-html-dom-parser/zipball/6db1e01db320040024cd1f74b0e1483aa2670720",
65
+ "reference": "6db1e01db320040024cd1f74b0e1483aa2670720",
66
  "shasum": ""
67
  },
68
  "require": {
69
  "php": ">=5.3.2"
70
  },
71
+ "time": "2019-03-05T14:12:22+00:00",
72
  "type": "library",
73
  "installation-source": "dist",
74
  "autoload": {
vendor/kub-at/php-simple-html-dom-parser/README.md CHANGED
@@ -1,8 +1,8 @@
1
  php-simple-html-dom-parser
2
  ==========================
3
 
4
- Version 1.7.1 - PHP 7.3 campatible
5
- Changelog: https://sourceforge.net/projects/simplehtmldom/files/simplehtmldom/1.7/
6
 
7
 
8
  Install
1
  php-simple-html-dom-parser
2
  ==========================
3
 
4
+ Version 1.8.1 - PHP 7.3 compatible
5
+ PHP Simple HTML DOM Parser changelog: https://sourceforge.net/projects/simplehtmldom/files/simplehtmldom/1.8.1/
6
 
7
 
8
  Install
vendor/kub-at/php-simple-html-dom-parser/src/KubAT/PhpSimple/lib/simple_html_dom.php CHANGED
@@ -3,7 +3,7 @@ namespace simple_html_dom;
3
 
4
  /**
5
  * Website: http://sourceforge.net/projects/simplehtmldom/
6
- * Additional projects that may be used: http://sourceforge.net/projects/debugobject/
7
  * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
8
  * Contributions by:
9
  * Yousuke Kumakura (Attribute filters)
@@ -13,21 +13,39 @@ namespace simple_html_dom;
13
  * all affected sections have comments starting with "PaperG"
14
  *
15
  * Paperg - Added case insensitive testing of the value of the selector.
16
- * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
17
- * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source,
18
- * it will almost always be smaller by some amount.
19
- * We use this to determine how far into the file the tag in question is. This "percentage will never be accurate as the $dom->size is the "real" number of bytes the dom was created from.
20
- * but for most purposes, it's a really good estimation.
21
- * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
 
 
 
 
 
 
 
22
  * Allow the user to tell us how much they trust the html.
23
- * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
24
- * This allows for us to find tags based on the text they contain.
25
- * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
26
- * Paperg: added parse_charset so that we know about the character set of the source document.
27
- * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type availalbe, we will assume it's returning the content-type header from the
28
- * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
29
  *
30
- * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  * PaperG (John Schlick) Added get_display_size for "IMG" tags.
32
  *
33
  * Licensed under The MIT License
@@ -36,7 +54,7 @@ namespace simple_html_dom;
36
  * @author S.C. Chen <me578022@gmail.com>
37
  * @author John Schlick
38
  * @author Rus Carroll
39
- * @version Rev. 1.7 (214)
40
  * @package PlaceLocalInclude
41
  * @subpackage simple_html_dom
42
  */
@@ -47,25 +65,33 @@ namespace simple_html_dom;
47
  */
48
  define('HDOM_TYPE_ELEMENT', 1);
49
  define('HDOM_TYPE_COMMENT', 2);
50
- define('HDOM_TYPE_TEXT', 3);
51
- define('HDOM_TYPE_ENDTAG', 4);
52
- define('HDOM_TYPE_ROOT', 5);
53
  define('HDOM_TYPE_UNKNOWN', 6);
54
  define('HDOM_QUOTE_DOUBLE', 0);
55
  define('HDOM_QUOTE_SINGLE', 1);
56
- define('HDOM_QUOTE_NO', 3);
57
- define('HDOM_INFO_BEGIN', 0);
58
- define('HDOM_INFO_END', 1);
59
- define('HDOM_INFO_QUOTE', 2);
60
- define('HDOM_INFO_SPACE', 3);
61
- define('HDOM_INFO_TEXT', 4);
62
- define('HDOM_INFO_INNER', 5);
63
- define('HDOM_INFO_OUTER', 6);
64
- define('HDOM_INFO_ENDSPACE',7);
65
- define('DEFAULT_TARGET_CHARSET', 'UTF-8');
66
- define('DEFAULT_BR_TEXT', "\r\n");
67
- define('DEFAULT_SPAN_TEXT', " ");
68
- define('MAX_FILE_SIZE', 600000);
 
 
 
 
 
 
 
 
69
 
70
  /** Contents between curly braces "{" and "}" are interpreted as text */
71
  define('HDOM_SMARTY_AS_TEXT', 1);
@@ -74,50 +100,94 @@ define('HDOM_SMARTY_AS_TEXT', 1);
74
  // -----------------------------------------------------------------------------
75
  // get html dom from file
76
  // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
77
- function file_get_html($url, $use_include_path = false, $context=null, $offset = 0, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
 
 
 
 
 
 
 
 
 
 
 
78
  {
79
  // Ensure maximum length is greater than zero
80
  if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }
81
 
82
  // We DO force the tags to be terminated.
83
- $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
84
- // For sourceforge users: uncomment the next line and comment the retrieve_url_contents line 2 lines down if it is not already done.
85
- $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
86
- // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
87
- //$contents = retrieve_url_contents($url);
88
- if (empty($contents) || strlen($contents) > $maxLen)
89
- {
90
- return false;
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  // The second parameter can force the selectors to all be lowercase.
93
  $dom->load($contents, $lowercase, $stripRN);
94
  return $dom;
95
  }
96
 
97
  // get html dom from string
98
- function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
 
 
 
 
 
 
 
99
  {
100
- $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
101
- if (empty($str) || strlen($str) > MAX_FILE_SIZE)
102
- {
 
 
 
 
 
 
 
103
  $dom->clear();
104
  return false;
105
  }
 
106
  $dom->load($str, $lowercase, $stripRN);
107
  return $dom;
108
  }
109
 
110
  // dump html dom tree
111
- function dump_html_tree($node, $show_attr=true, $deep=0)
112
  {
113
  $node->dump($node);
114
  }
115
 
116
-
117
  /**
118
  * simple html dom node
119
- * PaperG - added ability for "find" routine to lowercase the value of the selector.
120
- * PaperG - added $tag_start to track the start position of the tag in the total byte index
 
 
 
121
  *
122
  * @package PlaceLocalInclude
123
  */
@@ -211,91 +281,80 @@ class simple_html_dom_node
211
  }
212
 
213
  // dump node's tree
214
- function dump($show_attr=true, $deep=0)
215
  {
216
  $lead = str_repeat(' ', $deep);
217
 
218
- echo $lead.$this->tag;
219
- if ($show_attr && count($this->attr)>0)
220
- {
221
  echo '(';
222
- foreach ($this->attr as $k=>$v)
223
- echo "[$k]=>\"".$this->$k.'", ';
 
224
  echo ')';
225
  }
 
226
  echo "\n";
227
 
228
- if ($this->nodes)
229
- {
230
- foreach ($this->nodes as $c)
231
- {
232
- $c->dump($show_attr, $deep+1);
233
  }
234
  }
235
  }
236
 
237
 
238
  // Debugging function to dump a single dom node with a bunch of information about it.
239
- function dump_node($echo=true)
240
  {
241
-
242
  $string = $this->tag;
243
- if (count($this->attr)>0)
244
- {
245
  $string .= '(';
246
- foreach ($this->attr as $k=>$v)
247
- {
248
- $string .= "[$k]=>\"".$this->$k.'", ';
249
  }
250
  $string .= ')';
251
  }
252
- if (count($this->_)>0)
253
- {
254
  $string .= ' $_ (';
255
- foreach ($this->_ as $k=>$v)
256
- {
257
- if (is_array($v))
258
- {
259
  $string .= "[$k]=>(";
260
- foreach ($v as $k2=>$v2)
261
- {
262
- $string .= "[$k2]=>\"".$v2.'", ';
263
  }
264
- $string .= ")";
265
  } else {
266
- $string .= "[$k]=>\"".$v.'", ';
267
  }
268
  }
269
- $string .= ")";
270
  }
271
 
272
- if (isset($this->text))
273
- {
274
- $string .= " text: (" . $this->text . ")";
275
  }
276
 
277
  $string .= " HDOM_INNER_INFO: '";
278
- if (isset($node->_[HDOM_INFO_INNER]))
279
- {
280
  $string .= $node->_[HDOM_INFO_INNER] . "'";
281
- }
282
- else
283
- {
284
  $string .= ' NULL ';
285
  }
286
 
287
- $string .= " children: " . count($this->children);
288
- $string .= " nodes: " . count($this->nodes);
289
- $string .= " tag_start: " . $this->tag_start;
290
  $string .= "\n";
291
 
292
- if ($echo)
293
- {
294
  echo $string;
295
  return;
296
- }
297
- else
298
- {
299
  return $string;
300
  }
301
  }
@@ -307,12 +366,12 @@ class simple_html_dom_node
307
  * the current parent node.
308
  * @return object|null The parent node
309
  */
310
- function parent($parent=null)
311
  {
312
  // I am SURE that this doesn't work properly.
313
- // It fails to unset the current node from it's current parents nodes or children list first.
314
- if ($parent !== null)
315
- {
316
  $this->parent = $parent;
317
  $this->parent->nodes[] = $this;
318
  $this->parent->children[] = $this;
@@ -337,16 +396,16 @@ class simple_html_dom_node
337
  * @return object|array|null The child node at the specified index, all child
338
  * nodes or null if the index is invalid.
339
  */
340
- function children($idx=-1)
341
  {
342
- if ($idx===-1)
343
- {
344
  return $this->children;
345
  }
346
- if (isset($this->children[$idx]))
347
- {
348
  return $this->children[$idx];
349
  }
 
350
  return null;
351
  }
352
 
@@ -361,8 +420,7 @@ class simple_html_dom_node
361
  */
362
  function first_child()
363
  {
364
- if (count($this->children)>0)
365
- {
366
  return $this->children[0];
367
  }
368
  return null;
@@ -378,9 +436,8 @@ class simple_html_dom_node
378
  */
379
  function last_child()
380
  {
381
- if (($count=count($this->children))>0)
382
- {
383
- return $this->children[$count-1];
384
  }
385
  return null;
386
  }
@@ -393,21 +450,21 @@ class simple_html_dom_node
393
  */
394
  function next_sibling()
395
  {
396
- if ($this->parent===null)
397
- {
398
  return null;
399
  }
400
 
401
  $idx = 0;
402
  $count = count($this->parent->children);
403
- while ($idx<$count && $this!==$this->parent->children[$idx])
404
- {
405
  ++$idx;
406
  }
407
- if (++$idx>=$count)
408
- {
409
  return null;
410
  }
 
411
  return $this->parent->children[$idx];
412
  }
413
 
@@ -419,12 +476,17 @@ class simple_html_dom_node
419
  */
420
  function prev_sibling()
421
  {
422
- if ($this->parent===null) return null;
 
423
  $idx = 0;
424
  $count = count($this->parent->children);
425
- while ($idx<$count && $this!==$this->parent->children[$idx])
 
426
  ++$idx;
427
- if (--$idx<0) return null;
 
 
 
428
  return $this->parent->children[$idx];
429
  }
430
 
@@ -446,16 +508,18 @@ class simple_html_dom_node
446
  // Start by including ourselves in the comparison.
447
  $returnDom = $this;
448
 
449
- while (!is_null($returnDom))
450
- {
451
- if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); }
 
452
 
453
- if ($returnDom->tag == $tag)
454
- {
455
  break;
456
  }
 
457
  $returnDom = $returnDom->parent;
458
  }
 
459
  return $returnDom;
460
  }
461
 
@@ -466,12 +530,20 @@ class simple_html_dom_node
466
  */
467
  function innertext()
468
  {
469
- if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
470
- if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 
 
 
 
 
471
 
472
  $ret = '';
473
- foreach ($this->nodes as $n)
 
474
  $ret .= $n->outertext();
 
 
475
  return $ret;
476
  }
477
 
@@ -483,59 +555,61 @@ class simple_html_dom_node
483
  function outertext()
484
  {
485
  global $debug_object;
486
- if (is_object($debug_object))
487
- {
488
  $text = '';
489
- if ($this->tag == 'text')
490
- {
491
- if (!empty($this->text))
492
- {
493
- $text = " with text: " . $this->text;
494
  }
495
  }
 
496
  $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
497
  }
498
 
499
- if ($this->tag==='root') return $this->innertext();
500
 
501
  // trigger callback
502
- if ($this->dom && $this->dom->callback!==null)
503
- {
504
  call_user_func_array($this->dom->callback, array($this));
505
  }
506
 
507
- if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
508
- if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 
 
 
 
 
509
 
510
  // render begin tag
511
- if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
512
- {
513
  $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
514
  } else {
515
- $ret = "";
516
  }
517
 
518
  // render inner text
519
- if (isset($this->_[HDOM_INFO_INNER]))
520
- {
521
- // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
522
- if ($this->tag != "br")
523
- {
524
  $ret .= $this->_[HDOM_INFO_INNER];
525
  }
526
  } else {
527
- if ($this->nodes)
528
- {
529
- foreach ($this->nodes as $n)
530
- {
531
  $ret .= $this->convert_text($n->outertext());
532
  }
533
  }
534
  }
535
 
536
  // render end tag
537
- if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
538
- $ret .= '</'.$this->tag.'>';
 
 
539
  return $ret;
540
  }
541
 
@@ -546,35 +620,39 @@ class simple_html_dom_node
546
  */
547
  function text()
548
  {
549
- if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
550
- switch ($this->nodetype)
551
- {
 
 
552
  case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
553
  case HDOM_TYPE_COMMENT: return '';
554
  case HDOM_TYPE_UNKNOWN: return '';
555
  }
556
- if (strcasecmp($this->tag, 'script')===0) return '';
557
- if (strcasecmp($this->tag, 'style')===0) return '';
 
558
 
559
  $ret = '';
560
- // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
561
- // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
 
 
 
562
  // WHY is this happening?
563
- if (!is_null($this->nodes))
564
- {
565
- foreach ($this->nodes as $n)
566
- {
567
  // Start paragraph after a blank line
568
- if ($n->tag == 'p')
569
- {
570
  $ret .= "\n\n";
571
  }
572
 
573
  $ret .= $this->convert_text($n->text());
574
 
575
- // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
576
- if ($n->tag == "span")
577
- {
 
578
  $ret .= $this->dom->default_span_text;
579
  }
580
  }
@@ -599,72 +677,91 @@ class simple_html_dom_node
599
  function makeup()
600
  {
601
  // text, comment, unknown
602
- if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 
 
603
 
604
- $ret = '<'.$this->tag;
605
  $i = -1;
606
 
607
- foreach ($this->attr as $key=>$val)
608
- {
609
  ++$i;
610
 
611
  // skip removed attribute
612
- if ($val===null || $val===false)
613
- continue;
614
 
615
  $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
 
616
  //no value attr: nowrap, checked selected...
617
- if ($val===true)
618
  $ret .= $key;
619
- else {
620
  switch ($this->_[HDOM_INFO_QUOTE][$i])
621
  {
622
  case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
623
  case HDOM_QUOTE_SINGLE: $quote = '\''; break;
624
  default: $quote = '';
625
  }
626
- $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
 
 
 
 
 
 
 
627
  }
628
  }
 
629
  $ret = $this->dom->restore_noise($ret);
630
  return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
631
  }
632
 
633
- // find elements by css selector
634
- //PaperG - added ability for find to lowercase the value of the selector.
635
- function find($selector, $idx=null, $lowercase=false)
 
 
 
 
 
 
 
 
 
 
636
  {
637
  $selectors = $this->parse_selector($selector);
638
- if (($count=count($selectors))===0) return array();
639
  $found_keys = array();
640
 
641
  // find each selector
642
- for ($c=0; $c<$count; ++$c)
643
- {
644
- // The change on the below line was documented on the sourceforge code tracker id 2788009
645
  // used to be: if (($levle=count($selectors[0]))===0) return array();
646
- if (($levle=count($selectors[$c]))===0) return array();
647
- if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
648
 
649
- $head = array($this->_[HDOM_INFO_BEGIN]=>1);
 
650
 
651
  // handle descendant selectors, no recursive!
652
- for ($l=0; $l<$levle; ++$l)
653
- {
654
  $ret = array();
655
- foreach ($head as $k=>$v)
656
- {
657
- $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
658
  //PaperG - Pass this optional parameter on to the seek function.
659
- $n->seek($selectors[$c][$l], $ret, $lowercase);
660
  }
 
661
  $head = $ret;
 
662
  }
663
 
664
- foreach ($head as $k=>$v)
665
- {
666
- if (!isset($found_keys[$k]))
667
- {
668
  $found_keys[$k] = 1;
669
  }
670
  }
@@ -674,192 +771,476 @@ class simple_html_dom_node
674
  ksort($found_keys);
675
 
676
  $found = array();
677
- foreach ($found_keys as $k=>$v)
678
  $found[] = $this->dom->nodes[$k];
 
679
 
680
  // return nth-element or array
681
- if (is_null($idx)) return $found;
682
- else if ($idx<0) $idx = count($found) + $idx;
683
  return (isset($found[$idx])) ? $found[$idx] : null;
684
  }
685
 
686
- // seek for given conditions
687
- // PaperG - added parameter to allow for case insensitive testing of the value of a selector.
688
- protected function seek($selector, &$ret, $lowercase=false)
 
 
 
 
 
 
 
 
 
 
 
689
  {
690
  global $debug_object;
691
  if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
692
 
693
- list($tag, $key, $val, $exp, $no_key) = $selector;
694
-
695
- // xpath index
696
- if ($tag && $key && is_numeric($key))
697
- {
698
- $count = 0;
699
- foreach ($this->children as $c)
700
- {
701
- if ($tag==='*' || $tag===$c->tag) {
702
- if (++$count==$key) {
703
- $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
704
- return;
705
- }
706
  }
 
707
  }
708
- return;
709
- }
710
 
711
- $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
712
- if ($end==0) {
713
- $parent = $this->parent;
714
- while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) {
715
- $end -= 1;
716
- $parent = $parent->parent;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
717
  }
718
- $end += $parent->_[HDOM_INFO_END];
719
- }
720
 
721
- for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
722
- $node = $this->dom->nodes[$i];
 
 
723
 
724
- $pass = true;
 
 
 
725
 
726
- if ($tag==='*' && !$key) {
727
- if (in_array($node, $this->children, true))
728
- $ret[$i] = 1;
729
- continue;
730
  }
731
 
732
- // compare tag
733
- if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
734
- // compare key
735
- if ($pass && $key) {
736
- if ($no_key) {
737
- if (isset($node->attr[$key])) $pass=false;
738
- } else {
739
- if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
740
- }
741
  }
742
- // compare value
743
- if ($pass && $key && $val && $val!=='*') {
744
- // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
745
- if ($key == "plaintext") {
746
- // $node->plaintext actually returns $node->text();
747
- $nodeKeyValue = $node->text();
748
- } else {
749
- // this is a normal search, we want the value of that attribute of the tag.
750
- $nodeKeyValue = $node->attr[$key];
751
- }
752
- if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
753
 
754
- //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
755
- if ($lowercase) {
756
- $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
 
 
 
 
 
 
 
 
 
 
 
 
757
  } else {
758
- $check = $this->match($exp, $val, $nodeKeyValue);
759
  }
760
- if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}
761
-
762
- // handle multiple class
763
- if (!$check && strcasecmp($key, 'class')===0) {
764
- foreach (explode(' ',$node->attr[$key]) as $k) {
765
- // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
766
- if (!empty($k)) {
767
- if ($lowercase) {
768
- $check = $this->match($exp, strtolower($val), strtolower($k));
769
- } else {
770
- $check = $this->match($exp, $val, $k);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
771
  }
772
- if ($check) break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773
  }
774
  }
775
- }
776
- if (!$check) $pass = false;
777
  }
778
- if ($pass) $ret[$i] = 1;
 
 
779
  unset($node);
780
  }
781
  // It's passed by reference so this is actually what this function returns.
782
- if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
 
 
783
  }
784
 
785
- protected function match($exp, $pattern, $value) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
  global $debug_object;
787
  if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
788
 
 
 
 
 
 
789
  switch ($exp) {
790
  case '=':
791
- return ($value===$pattern);
792
  case '!=':
793
- return ($value!==$pattern);
794
  case '^=':
795
- return preg_match("/^".preg_quote($pattern,'/')."/", $value);
796
  case '$=':
797
- return preg_match("/".preg_quote($pattern,'/')."$/", $value);
798
  case '*=':
799
- if ($pattern[0]=='/') {
800
- return preg_match($pattern, $value);
801
- }
802
- return preg_match("/".$pattern."/i", $value);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
  }
804
  return false;
805
  }
806
 
807
- protected function parse_selector($selector_string) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
808
  global $debug_object;
809
- if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
810
 
811
- // pattern of CSS selectors, modified from mootools
812
- // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does.
813
- // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check.
814
- // Notice the \[ starting the attbute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured.
815
- // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.
816
- // farther study is required to determine of this should be documented or removed.
817
- // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
818
- $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
819
- preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
820
- if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
821
 
822
  $selectors = array();
823
  $result = array();
824
- //print_r($matches);
825
 
826
  foreach ($matches as $m) {
827
  $m[0] = trim($m[0]);
828
- if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
829
- // for browser generated xpath
830
- if ($m[1]==='tbody') continue;
831
-
832
- list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
833
- if (!empty($m[2])) {$key='id'; $val=$m[2];}
834
- if (!empty($m[3])) {$key='class'; $val=$m[3];}
835
- if (!empty($m[4])) {$key=$m[4];}
836
- if (!empty($m[5])) {$exp=$m[5];}
837
- if (!empty($m[6])) {$val=$m[6];}
838
-
839
- // convert to lowercase
840
- if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);}
841
- //elements that do NOT have the specified attribute
842
- if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}
843
-
844
- $result[] = array($tag, $key, $val, $exp, $no_key);
845
- if (trim($m[7])===',') {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
846
  $selectors[] = $result;
847
  $result = array();
848
  }
849
  }
850
- if (count($result)>0)
851
- $selectors[] = $result;
852
  return $selectors;
853
  }
854
 
855
  function __get($name)
856
  {
857
- if (isset($this->attr[$name]))
858
- {
859
  return $this->convert_text($this->attr[$name]);
860
  }
861
- switch ($name)
862
- {
863
  case 'outertext': return $this->outertext();
864
  case 'innertext': return $this->innertext();
865
  case 'plaintext': return $this->text();
@@ -871,27 +1252,28 @@ class simple_html_dom_node
871
  function __set($name, $value)
872
  {
873
  global $debug_object;
874
- if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
875
 
876
- switch ($name)
877
- {
878
  case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
879
  case 'innertext':
880
- if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
 
 
881
  return $this->_[HDOM_INFO_INNER] = $value;
882
  }
883
- if (!isset($this->attr[$name]))
884
- {
885
  $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
886
  $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
887
  }
 
888
  $this->attr[$name] = $value;
889
  }
890
 
891
  function __isset($name)
892
  {
893
- switch ($name)
894
- {
895
  case 'outertext': return true;
896
  case 'innertext': return true;
897
  case 'plaintext': return true;
@@ -900,51 +1282,56 @@ class simple_html_dom_node
900
  return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
901
  }
902
 
903
- function __unset($name) {
904
- if (isset($this->attr[$name]))
905
- unset($this->attr[$name]);
906
  }
907
 
908
- // PaperG - Function to convert the text from one character set to another if the two sets are not the same.
 
909
  function convert_text($text)
910
  {
911
  global $debug_object;
912
- if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
913
 
914
  $converted_text = $text;
915
 
916
- $sourceCharset = "";
917
- $targetCharset = "";
918
 
919
- if ($this->dom)
920
- {
921
  $sourceCharset = strtoupper($this->dom->_charset);
922
  $targetCharset = strtoupper($this->dom->_target_charset);
923
  }
924
- if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}
925
 
926
- if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
927
- {
 
 
 
 
 
 
 
 
 
 
928
  // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
929
- if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text)))
930
- {
931
  $converted_text = $text;
932
- }
933
- else
934
- {
935
  $converted_text = iconv($sourceCharset, $targetCharset, $text);
936
  }
937
  }
938
 
939
  // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
940
- if ($targetCharset == 'UTF-8')
941
- {
942
- if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
943
- {
944
  $converted_text = substr($converted_text, 3);
945
  }
946
- if (substr($converted_text, -3) == "\xef\xbb\xbf")
947
- {
948
  $converted_text = substr($converted_text, 0, -3);
949
  }
950
  }
@@ -960,48 +1347,40 @@ class simple_html_dom_node
960
  */
961
  static function is_utf8($str)
962
  {
963
- $c=0; $b=0;
964
- $bits=0;
965
- $len=strlen($str);
966
- for($i=0; $i<$len; $i++)
967
- {
968
- $c=ord($str[$i]);
969
- if($c > 128)
970
- {
971
- if(($c >= 254)) return false;
972
- elseif($c >= 252) $bits=6;
973
- elseif($c >= 248) $bits=5;
974
- elseif($c >= 240) $bits=4;
975
- elseif($c >= 224) $bits=3;
976
- elseif($c >= 192) $bits=2;
977
- else return false;
978
- if(($i+$bits) > $len) return false;
979
- while($bits > 1)
980
- {
981
  $i++;
982
- $b=ord($str[$i]);
983
- if($b < 128 || $b > 191) return false;
984
  $bits--;
985
  }
986
  }
987
  }
988
  return true;
989
  }
990
- /*
991
- function is_utf8($string)
992
- {
993
- //this is buggy
994
- return (utf8_encode(utf8_decode($string)) == $string);
995
- }
996
- */
997
 
998
  /**
999
- * Function to try a few tricks to determine the displayed size of an img on the page.
1000
- * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 
1001
  *
1002
  * @author John Schlick
1003
  * @version April 19 2012
1004
- * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
 
1005
  */
1006
  function get_display_size()
1007
  {
@@ -1010,57 +1389,54 @@ class simple_html_dom_node
1010
  $width = -1;
1011
  $height = -1;
1012
 
1013
- if ($this->tag !== 'img')
1014
- {
1015
  return false;
1016
  }
1017
 
1018
  // See if there is aheight or width attribute in the tag itself.
1019
- if (isset($this->attr['width']))
1020
- {
1021
  $width = $this->attr['width'];
1022
  }
1023
 
1024
- if (isset($this->attr['height']))
1025
- {
1026
  $height = $this->attr['height'];
1027
  }
1028
 
1029
  // Now look for an inline style.
1030
- if (isset($this->attr['style']))
1031
- {
1032
  // Thanks to user gnarf from stackoverflow for this regular expression.
1033
  $attributes = array();
1034
- preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
 
 
 
 
 
 
 
1035
  foreach ($matches as $match) {
1036
- $attributes[$match[1]] = $match[2];
1037
  }
1038
 
1039
  // If there is a width in the style attributes:
1040
- if (isset($attributes['width']) && $width == -1)
1041
- {
1042
  // check that the last two characters are px (pixels)
1043
- if (strtolower(substr($attributes['width'], -2)) == 'px')
1044
- {
1045
  $proposed_width = substr($attributes['width'], 0, -2);
1046
  // Now make sure that it's an integer and not something stupid.
1047
- if (filter_var($proposed_width, FILTER_VALIDATE_INT))
1048
- {
1049
  $width = $proposed_width;
1050
  }
1051
  }
1052
  }
1053
 
1054
  // If there is a width in the style attributes:
1055
- if (isset($attributes['height']) && $height == -1)
1056
- {
1057
  // check that the last two characters are px (pixels)
1058
- if (strtolower(substr($attributes['height'], -2)) == 'px')
1059
- {
1060
  $proposed_height = substr($attributes['height'], 0, -2);
1061
  // Now make sure that it's an integer and not something stupid.
1062
- if (filter_var($proposed_height, FILTER_VALIDATE_INT))
1063
- {
1064
  $height = $proposed_height;
1065
  }
1066
  }
@@ -1069,47 +1445,132 @@ class simple_html_dom_node
1069
  }
1070
 
1071
  // Future enhancement:
1072
- // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.
 
1073
 
1074
  // Far future enhancement
1075
- // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
1076
- // Note that in this case, the class or id will have the img subselector for it to apply to the image.
 
 
1077
 
1078
  // ridiculously far future development
1079
- // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.
 
 
 
 
 
 
 
1080
 
1081
- $result = array('height' => $height,
1082
- 'width' => $width);
1083
  return $result;
1084
  }
1085
 
1086
  // camel naming conventions
1087
- function getAllAttributes() {return $this->attr;}
1088
- function getAttribute($name) {return $this->__get($name);}
1089
- function setAttribute($name, $value) {$this->__set($name, $value);}
1090
- function hasAttribute($name) {return $this->__isset($name);}
1091
- function removeAttribute($name) {$this->__set($name, null);}
1092
- function getElementById($id) {return $this->find("#$id", 0);}
1093
- function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
1094
- function getElementByTagName($name) {return $this->find($name, 0);}
1095
- function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
1096
- function parentNode() {return $this->parent();}
1097
- function childNodes($idx=-1) {return $this->children($idx);}
1098
- function firstChild() {return $this->first_child();}
1099
- function lastChild() {return $this->last_child();}
1100
- function nextSibling() {return $this->next_sibling();}
1101
- function previousSibling() {return $this->prev_sibling();}
1102
- function hasChildNodes() {return $this->has_child();}
1103
- function nodeName() {return $this->tag;}
1104
- function appendChild($node) {$node->parent($this); return $node;}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1105
 
1106
  }
1107
 
1108
  /**
1109
  * simple html dom parser
1110
- * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
 
 
 
1111
  * Paperg - change $size from protected to public so we can easily access it
1112
- * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
 
 
1113
  *
1114
  * @package PlaceLocalInclude
1115
  */
@@ -1185,7 +1646,8 @@ class simple_html_dom
1185
  * Holds the current character at position {@see simple_html_dom::$pos} in
1186
  * the document {@see simple_html_dom::$doc}
1187
  *
1188
- * _Note_: Using this variable is more efficient than calling `substr($doc, $pos, 1)`
 
1189
  *
1190
  * @var string
1191
  */
@@ -1232,7 +1694,8 @@ class simple_html_dom
1232
  */
1233
  protected $token_attr = ' >';
1234
 
1235
- // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
 
1236
  public $_charset = '';
1237
  public $_target_charset = '';
1238
 
@@ -1241,14 +1704,14 @@ class simple_html_dom
1241
  *
1242
  * @var string
1243
  */
1244
- protected $default_br_text = "";
1245
 
1246
  /**
1247
  * Suffix for <span> elements
1248
  *
1249
  * @var string
1250
  */
1251
- public $default_span_text = "";
1252
 
1253
  /**
1254
  * Defines a list of self-closing tags (Void elements) according to the HTML
@@ -1263,20 +1726,20 @@ class simple_html_dom
1263
  * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
1264
  */
1265
  protected $self_closing_tags = array(
1266
- 'area'=>1,
1267
- 'base'=>1,
1268
- 'br'=>1,
1269
- 'col'=>1,
1270
- 'embed'=>1,
1271
- 'hr'=>1,
1272
- 'img'=>1,
1273
- 'input'=>1,
1274
- 'link'=>1,
1275
- 'meta'=>1,
1276
- 'param'=>1,
1277
- 'source'=>1,
1278
- 'track'=>1,
1279
- 'wbr'=>1
1280
  );
1281
 
1282
  /**
@@ -1291,12 +1754,12 @@ class simple_html_dom
1291
  * - Sort elements by name for better readability!
1292
  */
1293
  protected $block_tags = array(
1294
- 'body'=>1,
1295
- 'div'=>1,
1296
- 'form'=>1,
1297
- 'root'=>1,
1298
- 'span'=>1,
1299
- 'table'=>1
1300
  );
1301
 
1302
  /**
@@ -1355,38 +1818,55 @@ class simple_html_dom
1355
  * the document.
1356
  */
1357
  protected $optional_closing_tags = array(
1358
- 'b'=>array('b'=>1), // Not optional, see https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1359
- 'dd'=>array('dd'=>1, 'dt'=>1),
1360
- 'dl'=>array('dd'=>1, 'dt'=>1), // Not optional, see https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1361
- 'dt'=>array('dd'=>1, 'dt'=>1),
1362
- 'li'=>array('li'=>1),
1363
- 'optgroup'=>array('optgroup'=>1, 'option'=>1),
1364
- 'option'=>array('optgroup'=>1, 'option'=>1),
1365
- 'p'=>array('p'=>1),
1366
- 'rp'=>array('rp'=>1, 'rt'=>1),
1367
- 'rt'=>array('rp'=>1, 'rt'=>1),
1368
- 'td'=>array('td'=>1, 'th'=>1),
1369
- 'th'=>array('td'=>1, 'th'=>1),
1370
- 'tr'=>array('td'=>1, 'th'=>1, 'tr'=>1),
 
 
 
 
1371
  );
1372
 
1373
- function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0)
 
 
 
 
 
 
 
 
1374
  {
1375
- if ($str)
1376
- {
1377
- if (preg_match("/^http:\/\//i",$str) || is_file($str))
1378
- {
1379
  $this->load_file($str);
1380
- }
1381
- else
1382
- {
1383
- $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText, $options);
 
 
 
 
 
1384
  }
1385
  }
1386
- // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
 
1387
  if (!$forceTagsClosed) {
1388
- $this->optional_closing_array=array();
1389
  }
 
1390
  $this->_target_charset = $target_charset;
1391
  }
1392
 
@@ -1396,7 +1876,13 @@ class simple_html_dom
1396
  }
1397
 
1398
  // load html from string
1399
- function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0)
 
 
 
 
 
 
1400
  {
1401
  global $debug_object;
1402
 
@@ -1411,8 +1897,8 @@ class simple_html_dom
1411
 
1412
  // strip out the \r \n's if we are told to.
1413
  if ($stripRN) {
1414
- $this->doc = str_replace("\r", " ", $this->doc);
1415
- $this->doc = str_replace("\n", " ", $this->doc);
1416
 
1417
  // set the length of content since we have changed it.
1418
  $this->size = strlen($this->doc);
@@ -1442,7 +1928,6 @@ class simple_html_dom
1442
 
1443
  // make load function chainable
1444
  return $this;
1445
-
1446
  }
1447
 
1448
  // load html from file
@@ -1450,7 +1935,7 @@ class simple_html_dom
1450
  {
1451
  $args = func_get_args();
1452
 
1453
- if($doc = call_user_func_array('file_get_contents', $args) !== false) {
1454
  $this->load($doc, true);
1455
  } else {
1456
  return false;
@@ -1480,16 +1965,16 @@ class simple_html_dom
1480
  }
1481
 
1482
  // save dom as string
1483
- function save($filepath='')
1484
  {
1485
  $ret = $this->root->innertext();
1486
- if ($filepath!=='') file_put_contents($filepath, $ret, LOCK_EX);
1487
  return $ret;
1488
  }
1489
 
1490
  // find dom node by css selector
1491
  // Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
1492
- function find($selector, $idx=null, $lowercase=false)
1493
  {
1494
  return $this->root->find($selector, $idx, $lowercase);
1495
  }
@@ -1497,28 +1982,49 @@ class simple_html_dom
1497
  // clean up memory due to php5 circular references memory leak...
1498
  function clear()
1499
  {
1500
- foreach ($this->nodes as $n) {$n->clear(); $n = null;}
1501
- // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
1502
- if (isset($this->children)) foreach ($this->children as $n) {$n->clear(); $n = null;}
1503
- if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);}
1504
- if (isset($this->root)) {$this->root->clear(); unset($this->root);}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1505
  unset($this->doc);
1506
  unset($this->noise);
1507
  }
1508
 
1509
- function dump($show_attr=true)
1510
  {
1511
  $this->root->dump($show_attr);
1512
  }
1513
 
1514
  // prepare HTML data and init everything
1515
- protected function prepare($str, $lowercase=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
 
 
 
1516
  {
1517
  $this->clear();
1518
 
1519
  $this->doc = trim($str);
1520
  $this->size = strlen($this->doc);
1521
- $this->original_size = $this->size; // Save the original size of the html that we got in. It might be useful to someone.
1522
  $this->pos = 0;
1523
  $this->cursor = 1;
1524
  $this->noise = array();
@@ -1531,7 +2037,7 @@ class simple_html_dom
1531
  $this->root->_[HDOM_INFO_BEGIN] = -1;
1532
  $this->root->nodetype = HDOM_TYPE_ROOT;
1533
  $this->parent = $this->root;
1534
- if ($this->size>0) $this->char = $this->doc[0];
1535
  }
1536
 
1537
  /**
@@ -1544,8 +2050,7 @@ class simple_html_dom
1544
  while (true) {
1545
  // Read next tag if there is no text between current position and the
1546
  // next opening tag.
1547
- if (($s = $this->copy_until_char('<'))==='')
1548
- {
1549
  if($this->read_tag()) {
1550
  continue;
1551
  } else {
@@ -1561,80 +2066,122 @@ class simple_html_dom
1561
  }
1562
  }
1563
 
1564
- // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later.
1565
- // NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec
1566
- // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
 
 
 
 
1567
  protected function parse_charset()
1568
  {
1569
  global $debug_object;
1570
 
1571
  $charset = null;
1572
 
1573
- if (function_exists('get_last_retrieve_url_contents_content_type'))
1574
- {
1575
  $contentTypeHeader = get_last_retrieve_url_contents_content_type();
1576
  $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
1577
- if ($success)
1578
- {
1579
  $charset = $matches[1];
1580
- if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);}
 
 
 
 
 
1581
  }
1582
-
1583
  }
1584
 
1585
- if (empty($charset))
1586
- {
1587
- $el = $this->root->find('meta[http-equiv=Content-Type]',0, true);
1588
- if (!empty($el))
1589
- {
1590
  $fullvalue = $el->content;
1591
- if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);}
 
 
 
 
 
1592
 
1593
- if (!empty($fullvalue))
1594
- {
1595
- $success = preg_match('/charset=(.+)/i', $fullvalue, $matches);
1596
- if ($success)
1597
- {
 
 
 
1598
  $charset = $matches[1];
1599
- }
1600
- else
1601
- {
1602
- // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
1603
- if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
 
 
 
 
 
1604
  $charset = 'ISO-8859-1';
1605
  }
1606
  }
1607
  }
1608
  }
1609
 
1610
- // If we couldn't find a charset above, then lets try to detect one based on the text we got...
1611
- if (empty($charset))
1612
- {
1613
- // Use this in case mb_detect_charset isn't installed/loaded on this machine.
 
1614
  $charset = false;
1615
- if (function_exists('mb_detect_encoding'))
1616
- {
1617
  // Have php try to detect the encoding from the text given to us.
1618
- $charset = mb_detect_encoding($this->doc . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) );
1619
- if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);}
 
 
 
 
 
 
1620
  }
1621
 
1622
- // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
1623
- if ($charset === false)
1624
- {
1625
- if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');}
 
 
 
 
 
 
 
1626
  $charset = 'UTF-8';
1627
  }
1628
  }
1629
 
1630
- // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
1631
- if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1')))
1632
- {
1633
- if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
 
 
 
 
 
 
 
 
 
1634
  $charset = 'CP1252';
1635
  }
1636
 
1637
- if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);}
 
 
1638
 
1639
  return $this->_charset = $charset;
1640
  }
@@ -1647,88 +2194,100 @@ class simple_html_dom
1647
  protected function read_tag()
1648
  {
1649
  // Set end position if no further tags found
1650
- if ($this->char!=='<')
1651
- {
1652
  $this->root->_[HDOM_INFO_END] = $this->cursor;
1653
  return false;
1654
  }
 
1655
  $begin_tag_pos = $this->pos;
1656
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1657
 
1658
  // end tag
1659
- if ($this->char==='/')
1660
- {
1661
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1662
 
1663
  // Skip whitespace in end tags (i.e. in "</ html>")
1664
  $this->skip($this->token_blank);
1665
  $tag = $this->copy_until_char('>');
1666
 
1667
  // Skip attributes in end tags
1668
- if (($pos = strpos($tag, ' '))!==false)
1669
  $tag = substr($tag, 0, $pos);
 
1670
 
1671
  $parent_lower = strtolower($this->parent->tag);
1672
  $tag_lower = strtolower($tag);
1673
 
1674
  // The end tag is supposed to close the parent tag. Handle situations
1675
  // when it doesn't
1676
- if ($parent_lower!==$tag_lower)
1677
- {
1678
  // Parent tag does not have to be closed necessarily (optional closing tag)
1679
  // Current tag is a block tag, so it may close an ancestor
1680
- if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower]))
1681
- {
 
1682
  $this->parent->_[HDOM_INFO_END] = 0;
1683
  $org_parent = $this->parent;
1684
 
1685
  // Traverse ancestors to find a matching opening tag
1686
  // Stop at root node
1687
- while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
 
 
1688
  $this->parent = $this->parent->parent;
 
1689
 
1690
  // If we don't have a match add current tag as text node
1691
- if (strtolower($this->parent->tag)!==$tag_lower) {
1692
  $this->parent = $org_parent; // restore origonal parent
1693
- if ($this->parent->parent) $this->parent = $this->parent->parent;
 
 
 
 
1694
  $this->parent->_[HDOM_INFO_END] = $this->cursor;
1695
  return $this->as_text_node($tag);
1696
  }
1697
- }
1698
- // Grandparent exists and current tag is a block tag, so our parent doesn't have an end tag
1699
- else if (($this->parent->parent) && isset($this->block_tags[$tag_lower]))
1700
- {
 
1701
  $this->parent->_[HDOM_INFO_END] = 0; // No end tag
1702
  $org_parent = $this->parent;
1703
 
1704
  // Traverse ancestors to find a matching opening tag
1705
  // Stop at root node
1706
- while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
 
 
1707
  $this->parent = $this->parent->parent;
 
1708
 
1709
  // If we don't have a match add current tag as text node
1710
- if (strtolower($this->parent->tag)!==$tag_lower)
1711
- {
1712
  $this->parent = $org_parent; // restore origonal parent
1713
  $this->parent->_[HDOM_INFO_END] = $this->cursor;
1714
  return $this->as_text_node($tag);
1715
  }
1716
- }
1717
- // Grandparent exists and current tag closes it
1718
- else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower)
1719
- {
1720
  $this->parent->_[HDOM_INFO_END] = 0;
1721
  $this->parent = $this->parent->parent;
1722
- }
1723
- else // Random tag, add as text node
1724
  return $this->as_text_node($tag);
 
1725
  }
1726
 
1727
  // Set end position of parent tag to current cursor position
1728
  $this->parent->_[HDOM_INFO_END] = $this->cursor;
1729
- if ($this->parent->parent) $this->parent = $this->parent->parent;
1730
 
1731
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
 
 
 
 
1732
  return true;
1733
  }
1734
 
@@ -1743,25 +2302,27 @@ class simple_html_dom
1743
  // <!DOCTYPE html>
1744
  // <![CDATA[ ... ]]>
1745
  // <!-- Comment -->
1746
- if (isset($tag[0]) && $tag[0]==='!') {
1747
  $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
1748
 
1749
- if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') { // Comment ("<!--")
1750
  $node->nodetype = HDOM_TYPE_COMMENT;
1751
  $node->tag = 'comment';
1752
  } else { // Could be doctype or CDATA but we don't care
1753
  $node->nodetype = HDOM_TYPE_UNKNOWN;
1754
  $node->tag = 'unknown';
1755
  }
1756
- if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
 
 
1757
  $this->link_nodes($node, true);
1758
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1759
  return true;
1760
  }
1761
 
1762
  // The start tag cannot contain another start tag, if so add as text
1763
  // i.e. "<<html>"
1764
- if ($pos=strpos($tag, '<')!==false) {
1765
  $tag = '<' . substr($tag, 0, -1);
1766
  $node->_[HDOM_INFO_TEXT] = $tag;
1767
  $this->link_nodes($node, false);
@@ -1770,19 +2331,19 @@ class simple_html_dom
1770
  }
1771
 
1772
  // Handle invalid tag names (i.e. "<html#doc>")
1773
- if (!preg_match("/^\w[\w:-]*$/", $tag)) {
1774
  $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
1775
 
1776
  // Next char is the beginning of a new tag, don't touch it.
1777
- if ($this->char==='<') {
1778
  $this->link_nodes($node, false);
1779
  return true;
1780
  }
1781
 
1782
  // Next char closes current tag, add and be done with it.
1783
- if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
1784
  $this->link_nodes($node, false);
1785
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1786
  return true;
1787
  }
1788
 
@@ -1792,11 +2353,9 @@ class simple_html_dom
1792
  $node->tag = ($this->lowercase) ? $tag_lower : $tag;
1793
 
1794
  // handle optional closing tags
1795
- if (isset($this->optional_closing_tags[$tag_lower]) )
1796
- {
1797
  // Traverse ancestors to close all optional closing tags
1798
- while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)]))
1799
- {
1800
  $this->parent->_[HDOM_INFO_END] = 0;
1801
  $this->parent = $this->parent->parent;
1802
  }
@@ -1804,91 +2363,106 @@ class simple_html_dom
1804
  }
1805
 
1806
  $guard = 0; // prevent infinity loop
1807
- $space = array($this->copy_skip($this->token_blank), '', ''); // [0] Space between tag and first attribute
 
 
1808
 
1809
  // attributes
1810
- do
1811
- {
1812
  // Everything until the first equal sign should be the attribute name
1813
  $name = $this->copy_until($this->token_equal);
1814
 
1815
- if ($name==='' && $this->char!==null && $space[0]==='')
1816
- {
1817
  break;
1818
  }
1819
 
1820
- if ($guard===$this->pos) // Escape infinite loop
1821
- {
1822
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1823
  continue;
1824
  }
 
1825
  $guard = $this->pos;
1826
 
1827
  // handle endless '<'
1828
- if ($this->pos>=$this->size-1 && $this->char!=='>') { // Out of bounds before the tag ended
 
1829
  $node->nodetype = HDOM_TYPE_TEXT;
1830
  $node->_[HDOM_INFO_END] = 0;
1831
- $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
1832
  $node->tag = 'text';
1833
  $this->link_nodes($node, false);
1834
  return true;
1835
  }
1836
 
1837
  // handle mismatch '<'
1838
- if ($this->doc[$this->pos-1]=='<') { // Attributes cannot start after opening tag
 
1839
  $node->nodetype = HDOM_TYPE_TEXT;
1840
  $node->tag = 'text';
1841
  $node->attr = array();
1842
  $node->_[HDOM_INFO_END] = 0;
1843
- $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
 
 
 
 
1844
  $this->pos -= 2;
1845
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1846
  $this->link_nodes($node, false);
1847
  return true;
1848
  }
1849
 
1850
- if ($name!=='/' && $name!=='') { // this is a attribute name
1851
- $space[1] = $this->copy_skip($this->token_blank); // [1] Whitespace after attribute name
 
 
1852
  $name = $this->restore_noise($name); // might be a noisy name
1853
- if ($this->lowercase) $name = strtolower($name);
1854
- if ($this->char==='=') { // attribute with value
1855
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
 
 
1856
  $this->parse_attr($node, $name, $space); // get attribute value
1857
- }
1858
- else {
1859
  //no value attr: nowrap, checked selected...
1860
  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
1861
  $node->attr[$name] = true;
1862
- if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
1863
  }
 
1864
  $node->_[HDOM_INFO_SPACE][] = $space;
1865
- $space = array($this->copy_skip($this->token_blank), '', ''); // prepare for next attribute
1866
- }
1867
- else // no more attributes
 
 
 
 
 
1868
  break;
1869
- } while ($this->char!=='>' && $this->char!=='/'); // go until the tag ended
 
1870
 
1871
  $this->link_nodes($node, true);
1872
  $node->_[HDOM_INFO_ENDSPACE] = $space[0];
1873
 
1874
  // handle empty tags (i.e. "<div/>")
1875
- if ($this->copy_until_char('>')==='/')
1876
- {
1877
  $node->_[HDOM_INFO_ENDSPACE] .= '/';
1878
  $node->_[HDOM_INFO_END] = 0;
1879
- }
1880
- else
1881
- {
1882
  // reset parent
1883
- if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
 
 
1884
  }
1885
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
 
1886
 
1887
  // If it's a BR tag, we need to set it's text to the default text.
1888
  // This way when we see it in plaintext, we can generate formatting that the user wants.
1889
  // since a br tag never has sub nodes, this works well.
1890
- if ($node->tag == "br")
1891
- {
1892
  $node->_[HDOM_INFO_INNER] = $this->default_br_text;
1893
  }
1894
 
@@ -1906,36 +2480,40 @@ class simple_html_dom
1906
  protected function parse_attr($node, $name, &$space)
1907
  {
1908
  // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
1909
- // If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
 
1910
  // https://stackoverflow.com/a/26341866
1911
- if (isset($node->attr[$name]))
1912
- {
1913
  return;
1914
  }
1915
 
1916
- $space[2] = $this->copy_skip($this->token_blank); // [2] Whitespace between "=" and the value
 
 
1917
  switch ($this->char) {
1918
  case '"': // value is anything between double quotes
1919
  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
1920
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1921
  $node->attr[$name] = $this->restore_noise($this->copy_until_char('"'));
1922
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1923
  break;
1924
  case '\'': // value is anything between single quotes
1925
  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
1926
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1927
  $node->attr[$name] = $this->restore_noise($this->copy_until_char('\''));
1928
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1929
  break;
1930
  default: // value is anything until the first space or end tag
1931
  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
1932
  $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
1933
  }
1934
- // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
1935
- $node->attr[$name] = str_replace("\r", "", $node->attr[$name]);
1936
- $node->attr[$name] = str_replace("\n", "", $node->attr[$name]);
1937
- // PaperG: If this is a "class" selector, lets get rid of the preceeding and trailing space since some people leave it in the multi class case.
1938
- if ($name == "class") {
 
 
1939
  $node->attr[$name] = trim($node->attr[$name]);
1940
  }
1941
  }
@@ -1952,8 +2530,7 @@ class simple_html_dom
1952
  {
1953
  $node->parent = $this->parent;
1954
  $this->parent->nodes[] = $node;
1955
- if ($is_child)
1956
- {
1957
  $this->parent->children[] = $node;
1958
  }
1959
  }
@@ -1970,7 +2547,7 @@ class simple_html_dom
1970
  ++$this->cursor;
1971
  $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
1972
  $this->link_nodes($node, false);
1973
- $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1974
  return true;
1975
  }
1976
 
@@ -1985,7 +2562,7 @@ class simple_html_dom
1985
  protected function skip($chars)
1986
  {
1987
  $this->pos += strspn($this->doc, $chars, $this->pos);
1988
- $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
1989
  }
1990
 
1991
  /**
@@ -2001,8 +2578,8 @@ class simple_html_dom
2001
  $pos = $this->pos;
2002
  $len = strspn($this->doc, $chars, $pos);
2003
  $this->pos += $len;
2004
- $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
2005
- if ($len===0) return '';
2006
  return substr($this->doc, $pos, $len);
2007
  }
2008
 
@@ -2019,7 +2596,7 @@ class simple_html_dom
2019
  $pos = $this->pos;
2020
  $len = strcspn($this->doc, $chars, $pos);
2021
  $this->pos += $len;
2022
- $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
2023
  return substr($this->doc, $pos, $len);
2024
  }
2025
 
@@ -2033,20 +2610,21 @@ class simple_html_dom
2033
  */
2034
  protected function copy_until_char($char)
2035
  {
2036
- if ($this->char===null) return '';
2037
 
2038
- if (($pos = strpos($this->doc, $char, $this->pos))===false) {
2039
- $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
2040
  $this->char = null;
2041
  $this->pos = $this->size;
2042
  return $ret;
2043
  }
2044
 
2045
- if ($pos===$this->pos) return '';
 
2046
  $pos_old = $this->pos;
2047
  $this->char = $this->doc[$pos];
2048
  $this->pos = $pos;
2049
- return substr($this->doc, $pos_old, $pos-$pos_old);
2050
  }
2051
 
2052
  /**
@@ -2058,17 +2636,25 @@ class simple_html_dom
2058
  * @param bool $remove_tag True to remove the entire match. Default is false
2059
  * to only remove the captured data.
2060
  */
2061
- protected function remove_noise($pattern, $remove_tag=false)
2062
  {
2063
  global $debug_object;
2064
  if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2065
 
2066
- $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
 
 
 
 
 
 
 
 
 
 
 
 
2067
 
2068
- for ($i=$count-1; $i>-1; --$i)
2069
- {
2070
- $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
2071
- if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); }
2072
  $idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch
2073
  $this->noise[$key] = $matches[$i][$idx][0];
2074
  $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
@@ -2076,8 +2662,8 @@ class simple_html_dom
2076
 
2077
  // reset the length of content
2078
  $this->size = strlen($this->doc);
2079
- if ($this->size>0)
2080
- {
2081
  $this->char = $this->doc[0];
2082
  }
2083
  }
@@ -2095,28 +2681,42 @@ class simple_html_dom
2095
  global $debug_object;
2096
  if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2097
 
2098
- while (($pos=strpos($text, '___noise___'))!==false)
2099
- {
2100
- // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
2101
- if (strlen($text) > $pos+15)
2102
- { // todo: "___noise___1000" (or any number with four or more digits) in the DOM causes an infinite loop which could be utilized by malicious software
2103
- $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15];
2104
- if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); }
2105
-
2106
- if (isset($this->noise[$key]))
2107
- {
2108
- $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
 
 
 
 
 
 
2109
  }
2110
- else
2111
- {
 
 
 
 
2112
  // do this to prevent an infinite loop.
2113
- $text = substr($text, 0, $pos).'UNDEFINED NOISE FOR KEY: '.$key . substr($text, $pos+16);
 
 
 
2114
  }
2115
- }
2116
- else
2117
- {
2118
- // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
2119
- $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11);
 
2120
  }
2121
  }
2122
  return $text;
@@ -2128,14 +2728,13 @@ class simple_html_dom
2128
  global $debug_object;
2129
  if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2130
 
2131
- foreach($this->noise as $noiseElement)
2132
- {
2133
- if (strpos($noiseElement, $text)!==false)
2134
- {
2135
  return $noiseElement;
2136
  }
2137
  }
2138
  }
 
2139
  function __toString()
2140
  {
2141
  return $this->root->innertext();
@@ -2143,8 +2742,7 @@ class simple_html_dom
2143
 
2144
  function __get($name)
2145
  {
2146
- switch ($name)
2147
- {
2148
  case 'outertext':
2149
  return $this->root->innertext();
2150
  case 'innertext':
@@ -2159,16 +2757,54 @@ class simple_html_dom
2159
  }
2160
 
2161
  // camel naming conventions
2162
- function childNodes($idx=-1) {return $this->root->childNodes($idx);}
2163
- function firstChild() {return $this->root->first_child();}
2164
- function lastChild() {return $this->root->last_child();}
2165
- function createElement($name, $value=null) {return @str_get_html("<$name>$value</$name>")->first_child();}
2166
- function createTextNode($value) {return @end(str_get_html($value)->nodes);}
2167
- function getElementById($id) {return $this->find("#$id", 0);}
2168
- function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
2169
- function getElementByTagName($name) {return $this->find($name, 0);}
2170
- function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
2171
- function loadFile() {$args = func_get_args();$this->load_file($args);}
2172
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2173
 
2174
- ?>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  /**
5
  * Website: http://sourceforge.net/projects/simplehtmldom/
6
+ * Additional projects: http://sourceforge.net/projects/debugobject/
7
  * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
8
  * Contributions by:
9
  * Yousuke Kumakura (Attribute filters)
13
  * all affected sections have comments starting with "PaperG"
14
  *
15
  * Paperg - Added case insensitive testing of the value of the selector.
16
+ *
17
+ * Paperg - Added tag_start for the starting index of tags - NOTE: This works
18
+ * but not accurately. This tag_start gets counted AFTER \r\n have been crushed
19
+ * out, and after the remove_noise calls so it will not reflect the REAL
20
+ * position of the tag in the source, it will almost always be smaller by some
21
+ * amount. We use this to determine how far into the file the tag in question
22
+ * is. This "percentage" will never be accurate as the $dom->size is the "real"
23
+ * number of bytes the dom was created from. But for most purposes, it's a
24
+ * really good estimation.
25
+ *
26
+ * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags
27
+ * closed is great for malformed html, but it CAN lead to parsing errors.
28
+ *
29
  * Allow the user to tell us how much they trust the html.
 
 
 
 
 
 
30
  *
31
+ * Paperg add the text and plaintext to the selectors for the find syntax.
32
+ * plaintext implies text in the innertext of a node. text implies that the
33
+ * tag is a text node. This allows for us to find tags based on the text they
34
+ * contain.
35
+ *
36
+ * Create find_ancestor_tag to see if a tag is - at any level - inside of
37
+ * another specific tag.
38
+ *
39
+ * Paperg: added parse_charset so that we know about the character set of
40
+ * the source document. NOTE: If the user's system has a routine called
41
+ * get_last_retrieve_url_contents_content_type available, we will assume it's
42
+ * returning the content-type header from the last transfer or curl_exec, and
43
+ * we will parse that and use it in preference to any other method of charset
44
+ * detection.
45
+ *
46
+ * Found infinite loop in the case of broken html in restore_noise. Rewrote to
47
+ * protect from that.
48
+ *
49
  * PaperG (John Schlick) Added get_display_size for "IMG" tags.
50
  *
51
  * Licensed under The MIT License
54
  * @author S.C. Chen <me578022@gmail.com>
55
  * @author John Schlick
56
  * @author Rus Carroll
57
+ * @version Rev. 1.8.1 (247)
58
  * @package PlaceLocalInclude
59
  * @subpackage simple_html_dom
60
  */
65
  */
66
  define('HDOM_TYPE_ELEMENT', 1);
67
  define('HDOM_TYPE_COMMENT', 2);
68
+ define('HDOM_TYPE_TEXT', 3);
69
+ define('HDOM_TYPE_ENDTAG', 4);
70
+ define('HDOM_TYPE_ROOT', 5);
71
  define('HDOM_TYPE_UNKNOWN', 6);
72
  define('HDOM_QUOTE_DOUBLE', 0);
73
  define('HDOM_QUOTE_SINGLE', 1);
74
+ define('HDOM_QUOTE_NO', 3);
75
+ define('HDOM_INFO_BEGIN', 0);
76
+ define('HDOM_INFO_END', 1);
77
+ define('HDOM_INFO_QUOTE', 2);
78
+ define('HDOM_INFO_SPACE', 3);
79
+ define('HDOM_INFO_TEXT', 4);
80
+ define('HDOM_INFO_INNER', 5);
81
+ define('HDOM_INFO_OUTER', 6);
82
+ define('HDOM_INFO_ENDSPACE', 7);
83
+
84
+ /** The default target charset */
85
+ defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
86
+
87
+ /** The default <br> text used instead of <br> tags when returning text */
88
+ defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
89
+
90
+ /** The default <span> text used instead of <span> tags when returning text */
91
+ defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
92
+
93
+ /** The maximum file size the parser should load */
94
+ defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
95
 
96
  /** Contents between curly braces "{" and "}" are interpreted as text */
97
  define('HDOM_SMARTY_AS_TEXT', 1);
100
  // -----------------------------------------------------------------------------
101
  // get html dom from file
102
  // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
103
/**
 * Build a DOM from the contents of a file or URL.
 *
 * $url, $use_include_path, $context, $offset and $maxLen are handed to
 * file_get_contents(); the remaining arguments are handed to the
 * simple_html_dom constructor ($lowercase and $stripRN also to load()).
 *
 * @param string $url Path or URL to read
 * @param bool $use_include_path Forwarded to file_get_contents()
 * @param resource|null $context Forwarded to file_get_contents()
 * @param int $offset Forwarded to file_get_contents()
 * @param int $maxLen Maximum number of bytes to read. Any value <= 0 is
 * replaced by MAX_FILE_SIZE.
 * @param bool $lowercase Forwarded to the parser
 * @param bool $forceTagsClosed Forwarded to the parser
 * @param string $target_charset Forwarded to the parser
 * @param bool $stripRN Forwarded to the parser
 * @param string $defaultBRText Forwarded to the parser
 * @param string $defaultSpanText Forwarded to the parser
 * @return simple_html_dom|false The loaded DOM, or false if nothing could be
 * read or the contents exceed $maxLen.
 */
function file_get_html(
    $url,
    $use_include_path = false,
    $context = null,
    $offset = 0,
    $maxLen = -1,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    // Ensure maximum length is greater than zero
    if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);

    $contents = file_get_contents(
        $url,
        $use_include_path,
        $context,
        $offset,
        $maxLen);

    if (empty($contents) || strlen($contents) > $maxLen) {
        // Release the unused parser before bailing out, the same way
        // str_get_html() does on its own failure path.
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
149
 
150
  // get html dom from string
151
/**
 * Build a DOM from an HTML string.
 *
 * All arguments after $str are handed to the simple_html_dom constructor;
 * $lowercase and $stripRN are additionally handed to load().
 *
 * @param string $str The markup to parse
 * @return simple_html_dom|false The loaded DOM, or false if $str is empty or
 * longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
    $str,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);

    $parseable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;

    if ($parseable) {
        $dom->load($str, $lowercase, $stripRN);
        return $dom;
    }

    // Nothing to parse: discard the parser and report failure.
    $dom->clear();
    return false;
}
177
 
178
  // dump html dom tree
179
/**
 * Print the tree below $node by delegating to its dump() method.
 *
 * @param object $node Node whose tree is dumped
 * @param bool $show_attr Passed on to $node->dump()
 * @param int $deep Passed on to $node->dump()
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
    // Forward the caller's options. Previously the node itself was passed
    // as the first argument, so $show_attr and $deep were silently ignored.
    $node->dump($show_attr, $deep);
}
183
 
 
184
  /**
185
  * simple html dom node
186
+ * PaperG - added ability for "find" routine to lowercase the value of the
187
+ * selector.
188
+ *
189
+ * PaperG - added $tag_start to track the start position of the tag in the total
190
+ * byte index
191
  *
192
  * @package PlaceLocalInclude
193
  */
281
  }
282
 
283
  // dump node's tree
284
/**
 * Echo this node and, recursively, every node below it, one per line.
 *
 * @param bool $show_attr Whether to append the attribute list to each line
 * @param int $deep Nesting depth, used to repeat the indent string
 * @return void
 */
function dump($show_attr = true, $deep = 0)
{
    // Assemble the whole line first: indent, tag name, optional attributes.
    $line = str_repeat(' ', $deep) . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        $pairs = '';
        foreach (array_keys($this->attr) as $name) {
            // The value is read through property access on the node.
            $pairs .= "[$name]=>\"" . $this->$name . '", ';
        }
        $line .= '(' . $pairs . ')';
    }

    echo $line . "\n";

    // Descend one level for every node attached to this one.
    if ($this->nodes) {
        foreach ($this->nodes as $child) {
            $child->dump($show_attr, $deep + 1);
        }
    }
}
306
 
307
 
308
  // Debugging function to dump a single dom node with a bunch of information about it.
309
/**
 * Describe this single node: tag, attributes, the internal `$_` info array,
 * text, inner info, child/node counts and tag_start.
 *
 * @param bool $echo True to echo the description, false to return it
 * @return string|void The description when $echo is false
 */
function dump_node($echo = true)
{
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"" . $this->$k . '", ';
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    // Entries can themselves be arrays (makeup() reads
                    // $this->_[HDOM_INFO_SPACE][$i][0..2]); join them rather
                    // than triggering an array-to-string conversion.
                    if (is_array($v2)) {
                        $v2 = '(' . implode(', ', $v2) . ')';
                    }
                    $string .= "[$k2]=>\"" . $v2 . '", ';
                }
                $string .= ')';
            } else {
                $string .= "[$k]=>\"" . $v . '", ';
            }
        }
        $string .= ')';
    }

    if (isset($this->text)) {
        $string .= ' text: (' . $this->text . ')';
    }

    $string .= " HDOM_INNER_INFO: '";

    // Inspect this node. The previous code tested an undefined $node
    // variable, so this branch could never be taken.
    if (isset($this->_[HDOM_INFO_INNER])) {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    $string .= ' children: ' . count($this->children);
    $string .= ' nodes: ' . count($this->nodes);
    $string .= ' tag_start: ' . $this->tag_start;
    $string .= "\n";

    if ($echo) {
        echo $string;
        return;
    }

    return $string;
}
366
  * the current parent node.
367
  * @return object|null The parent node
368
  */
369
+ function parent($parent = null)
370
  {
371
  // I am SURE that this doesn't work properly.
372
+ // It fails to unset the current node from it's current parents nodes or
373
+ // children list first.
374
+ if ($parent !== null) {
375
  $this->parent = $parent;
376
  $this->parent->nodes[] = $this;
377
  $this->parent->children[] = $this;
396
  * @return object|array|null The child node at the specified index, all child
397
  * nodes or null if the index is invalid.
398
  */
399
function children($idx = -1)
{
    // A specific index was requested: hand back that child, or null when
    // there is no child stored under that index.
    if ($idx !== -1) {
        return isset($this->children[$idx]) ? $this->children[$idx] : null;
    }

    // -1 means "all children".
    return $this->children;
}
411
 
420
  */
421
  function first_child()
422
  {
423
+ if (count($this->children) > 0) {
 
424
  return $this->children[0];
425
  }
426
  return null;
436
  */
437
function last_child()
{
    $total = count($this->children);

    // The last child sits at index count - 1; no children means null.
    return ($total > 0) ? $this->children[$total - 1] : null;
}
450
  */
451
function next_sibling()
{
    // A node without a parent has no siblings.
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $total = count($siblings);

    // Locate this node; $pos ends at $total when it is not listed.
    for ($pos = 0; $pos < $total; ++$pos) {
        if ($siblings[$pos] === $this) {
            break;
        }
    }

    $next = $pos + 1;

    return ($next < $total) ? $siblings[$next] : null;
}
470
 
476
  */
477
function prev_sibling()
{
    // A node without a parent has no siblings.
    if ($this->parent === null) { return null; }

    // Identity search, so only this very object counts as a match.
    $idx = array_search($this, $this->parent->children, true);

    // When this node is not listed among its parent's children there is no
    // previous sibling. The old linear scan ran off the end of the list in
    // that case and then returned the parent's LAST child; next_sibling()
    // returns null in the same situation.
    if ($idx === false) { return null; }

    // The first child has nothing before it.
    if (!isset($this->parent->children[$idx - 1])) { return null; }

    return $this->parent->children[$idx - 1];
}
492
 
508
  // Start by including ourselves in the comparison.
509
  $returnDom = $this;
510
 
511
+ while (!is_null($returnDom)) {
512
+ if (is_object($debug_object)) {
513
+ $debug_object->debug_log(2, 'Current tag is: ' . $returnDom->tag);
514
+ }
515
 
516
+ if ($returnDom->tag == $tag) {
 
517
  break;
518
  }
519
+
520
  $returnDom = $returnDom->parent;
521
  }
522
+
523
  return $returnDom;
524
  }
525
 
530
  */
531
function innertext()
{
    // An explicitly stored inner value wins over everything else.
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    // Nodes carrying stored text return it after the DOM's restore_noise().
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // Otherwise the inner text is the outer text of every node below.
    $parts = array();

    foreach ($this->nodes as $child) {
        $parts[] = $child->outertext();
    }

    return implode('', $parts);
}
549
 
555
function outertext()
{
    global $debug_object;

    // Optional debug trace; text nodes also log their text.
    if (is_object($debug_object)) {
        $suffix = ($this->tag === 'text' && !empty($this->text))
            ? ' with text: ' . $this->text
            : '';

        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
    }

    // The root node has no markup of its own.
    if ($this->tag === 'root') { return $this->innertext(); }

    // trigger callback
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    // Stored outer value, then stored text, take precedence over rendering.
    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }

    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // render begin tag
    $markup = '';

    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
        $markup = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // render inner text
    if (!isset($this->_[HDOM_INFO_INNER])) {
        if ($this->nodes) {
            foreach ($this->nodes as $child) {
                $markup .= $this->convert_text($child->outertext());
            }
        }
    } elseif ($this->tag !== 'br') {
        // A br tag never emits the stored inner value.
        $markup .= $this->_[HDOM_INFO_INNER];
    }

    // render end tag
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
        $markup .= '</' . $this->tag . '>';
    }

    return $markup;
}
615
 
620
  */
621
  function text()
622
  {
623
+ if (isset($this->_[HDOM_INFO_INNER])) {
624
+ return $this->_[HDOM_INFO_INNER];
625
+ }
626
+
627
+ switch ($this->nodetype) {
628
  case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
629
  case HDOM_TYPE_COMMENT: return '';
630
  case HDOM_TYPE_UNKNOWN: return '';
631
  }
632
+
633
+ if (strcasecmp($this->tag, 'script') === 0) { return ''; }
634
+ if (strcasecmp($this->tag, 'style') === 0) { return ''; }
635
 
636
  $ret = '';
637
+
638
+ // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
639
+ // for some span tags, and some p tags) $this->nodes is set to NULL.
640
+ // NOTE: This indicates that there is a problem where it's set to NULL
641
+ // without a clear happening.
642
  // WHY is this happening?
643
+ if (!is_null($this->nodes)) {
644
+ foreach ($this->nodes as $n) {
 
 
645
  // Start paragraph after a blank line
646
+ if ($n->tag === 'p') {
 
647
  $ret .= "\n\n";
648
  }
649
 
650
  $ret .= $this->convert_text($n->text());
651
 
652
+ // If this node is a span... add a space at the end of it so
653
+ // multiple spans don't run into each other. This is plaintext
654
+ // after all.
655
+ if ($n->tag === 'span') {
656
  $ret .= $this->dom->default_span_text;
657
  }
658
  }
677
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $ret = '<' . $this->tag;

    // $pos is the attribute's ordinal position; the spacing and quote
    // records in $this->_ are indexed by it, so skipped attributes still
    // consume a position.
    foreach (array_keys($this->attr) as $pos => $name) {
        $value = $this->attr[$name];

        // skip removed attribute
        if ($value === null || $value === false) { continue; }

        $ret .= $this->_[HDOM_INFO_SPACE][$pos][0];

        // no value attr: nowrap, checked selected...
        if ($value === true) {
            $ret .= $name;
            continue;
        }

        // Loose comparison on purpose, matching a switch statement.
        $style = $this->_[HDOM_INFO_QUOTE][$pos];

        if ($style == HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($style == HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $ret .= $name;
        $ret .= $this->_[HDOM_INFO_SPACE][$pos][1];
        $ret .= '=';
        $ret .= $this->_[HDOM_INFO_SPACE][$pos][2];
        $ret .= $quote . $value . $quote;
    }

    $ret = $this->dom->restore_noise($ret);

    return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
719
 
720
+ /**
721
+ * Find elements by CSS selector
722
+ *
723
+ * @param string $selector The CSS selector
724
+ * @param int|null $idx Index of element to return from the list of matching
725
+ * elements (default: `null` = disabled).
726
+ * @param bool $lowercase Matches tag names case insensitive (lowercase) if
727
+ * enabled (default: `false`)
728
+ * @return array|object|null A list of elements matching the specified CSS
729
+ * selector or a single element if $idx is specified or null if no element
730
+ * was found.
731
+ */
732
+ function find($selector, $idx = null, $lowercase = false)
733
  {
734
  $selectors = $this->parse_selector($selector);
735
+ if (($count = count($selectors)) === 0) { return array(); }
736
  $found_keys = array();
737
 
738
  // find each selector
739
+ for ($c = 0; $c < $count; ++$c) {
740
+ // The change on the below line was documented on the sourceforge
741
+ // code tracker id 2788009
742
  // used to be: if (($levle=count($selectors[0]))===0) return array();
743
+ if (($levle = count($selectors[$c])) === 0) { return array(); }
744
+ if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }
745
 
746
+ $head = array($this->_[HDOM_INFO_BEGIN] => 1);
747
+ $cmd = ' '; // Combinator
748
 
749
  // handle descendant selectors, no recursive!
750
+ for ($l = 0; $l < $levle; ++$l) {
 
751
  $ret = array();
752
+
753
+ foreach ($head as $k => $v) {
754
+ $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
755
  //PaperG - Pass this optional parameter on to the seek function.
756
+ $n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
757
  }
758
+
759
  $head = $ret;
760
+ $cmd = $selectors[$c][$l][4]; // Next Combinator
761
  }
762
 
763
+ foreach ($head as $k => $v) {
764
+ if (!isset($found_keys[$k])) {
 
 
765
  $found_keys[$k] = 1;
766
  }
767
  }
771
  ksort($found_keys);
772
 
773
  $found = array();
774
+ foreach ($found_keys as $k => $v) {
775
  $found[] = $this->dom->nodes[$k];
776
+ }
777
 
778
  // return nth-element or array
779
+ if (is_null($idx)) { return $found; }
780
+ elseif ($idx < 0) { $idx = count($found) + $idx; }
781
  return (isset($found[$idx])) ? $found[$idx] : null;
782
  }
783
 
784
/**
 * Seek DOM elements by selector
 *
 * **Note**
 * The selector element must be compatible to a selector from
 * {@see simple_html_dom_node::parse_selector()}
 *
 * Candidate nodes are chosen relative to this node according to
 * $parent_cmd, then filtered by tag, id, classes and attributes. The
 * HDOM_INFO_BEGIN index of every match is added as a key to $ret.
 *
 * @param array $selector A selector element
 * @param array $ret An array of matches (passed by reference; this is the
 * method's actual result)
 * @param string $parent_cmd Combinator linking this node to the selector:
 * ' ' (descendant), '>' (child), '+' (next sibling) or '~' (subsequent
 * siblings). Any other value yields no candidates.
 * @param bool $lowercase Matches tag names case insensitive (lowercase) if
 * enabled (default: `false`)
 * @return void
 */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // $cmb (the selector's own combinator) is not used here.
    list($tag, $id, $class, $attributes, $cmb) = $selector;
    $nodes = array();

    if ($parent_cmd === ' ') { // Descendant Combinator
        // Find parent closing tag if the current element doesn't have a closing
        // tag (i.e. void element)
        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end == 0) {
            $parent = $this->parent;
            // Test for null first so the chain is never dereferenced past
            // its top.
            while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            // No ancestor with an end index: leave $end as it is instead
            // of reading a property of null.
            if ($parent !== null) {
                $end += $parent->_[HDOM_INFO_END];
            }
        }

        // Get list of target nodes
        $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
        $nodes_count = $end - $nodes_start;
        $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
    } elseif ($parent_cmd === '>') { // Child Combinator
        $nodes = $this->children;
    } elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
        // Identity search: a loose in_array() compares objects property by
        // property and can match a different node.
        $index = array_search($this, $this->parent->children, true);

        if ($index !== false) {
            if ($parent_cmd === '+') { // Next-Sibling Combinator
                // The last child has no next sibling; do not index past
                // the end of the list.
                if (isset($this->parent->children[$index + 1])) {
                    $nodes[] = $this->parent->children[$index + 1];
                }
            } else { // Subsequent Sibling Combinator
                // Only siblings AFTER this node qualify; the node itself
                // must not be a candidate.
                $nodes = array_slice($this->parent->children, $index + 1);
            }
        }
    }

    // Go through each candidate element.
    foreach ($nodes as $node) {
        $pass = true;

        // Skip root nodes
        if (!$node->parent) {
            $pass = false;
        }

        // Skip if node isn't a child node (i.e. text nodes)
        if ($pass && !in_array($node, $node->parent->children, true)) {
            $pass = false;
        }

        // Skip if tag doesn't match
        if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
            $pass = false;
        }

        // Skip if ID doesn't exist
        if ($pass && $id !== '' && !isset($node->attr['id'])) {
            $pass = false;
        }

        // Check if ID matches
        if ($pass && $id !== '' && isset($node->attr['id'])) {
            // Note: Only consider the first ID (as browsers do)
            $node_id = explode(' ', trim($node->attr['id']))[0];

            if ($id !== $node_id) { $pass = false; }
        }

        // Check if all class(es) exist
        if ($pass && $class !== '' && is_array($class) && !empty($class)) {
            if (isset($node->attr['class'])) {
                $node_classes = explode(' ', $node->attr['class']);

                if ($lowercase) {
                    $node_classes = array_map('strtolower', $node_classes);
                }

                foreach ($class as $c) {
                    if (!in_array($c, $node_classes)) {
                        $pass = false;
                        break;
                    }
                }
            } else {
                $pass = false;
            }
        }

        // Check attributes
        if ($pass
            && $attributes !== ''
            && is_array($attributes)
            && !empty($attributes)) {
            foreach ($attributes as $a) {
                list (
                    $att_name,
                    $att_expr,
                    $att_val,
                    $att_inv,
                    $att_case_sensitivity
                ) = $a;

                // Handle indexing attributes (i.e. "[2]")
                /**
                 * Note: This is not supported by the CSS Standard but adds
                 * the ability to select items compatible to XPath (i.e.
                 * the 3rd element within it's parent).
                 *
                 * Note: This doesn't conflict with the CSS Standard which
                 * doesn't work on numeric attributes anyway.
                 */
                if (is_numeric($att_name)
                    && $att_expr === ''
                    && $att_val === '') {
                    $count = 0;

                    // Find index of current element in parent
                    foreach ($node->parent->children as $c) {
                        if ($c->tag === $node->tag) ++$count;
                        if ($c === $node) break;
                    }

                    // If this is the correct node, continue with next
                    // attribute
                    if ($count === (int)$att_name) continue;
                }

                // Check attribute availability
                if ($att_inv) { // Attribute should NOT be set
                    if (isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                } else { // Attribute should be set
                    // todo: "plaintext" is not a valid CSS selector!
                    if ($att_name !== 'plaintext'
                        && !isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                }

                // Continue with next attribute if expression isn't defined
                if ($att_expr === '') continue;

                // If they have told us that this is a "plaintext"
                // search then we want the plaintext of the node - right?
                // todo "plaintext" is not a valid CSS selector!
                if ($att_name === 'plaintext') {
                    $nodeKeyValue = $node->text();
                } else {
                    $nodeKeyValue = $node->attr[$att_name];
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'testing node: '
                        . $node->tag
                        . ' for attribute: '
                        . $att_name
                        . $att_expr
                        . $att_val
                        . ' where nodes value is: '
                        . $nodeKeyValue
                    );
                }

                // If lowercase is set, do a case insensitive test of
                // the value of the selector.
                if ($lowercase) {
                    $check = $this->match(
                        $att_expr,
                        strtolower($att_val),
                        strtolower($nodeKeyValue),
                        $att_case_sensitivity
                    );
                } else {
                    $check = $this->match(
                        $att_expr,
                        $att_val,
                        $nodeKeyValue,
                        $att_case_sensitivity
                    );
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'after match: '
                        . ($check ? 'true' : 'false')
                    );
                }

                if (!$check) {
                    $pass = false;
                    break;
                }
            }
        }

        // Found a match. Add to list and clear node
        if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {
        $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
    }
}
1010
 
1011
/**
 * Match value and pattern for a given CSS expression
 *
 * **Supported Expressions**
 *
 * | Expression | Description
 * | ---------- | -----------
 * | `=`        | $value and $pattern must be equal
 * | `!=`       | $value and $pattern must not be equal
 * | `^=`       | $value must start with $pattern
 * | `$=`       | $value must end with $pattern
 * | `*=`       | $value must contain $pattern
 * | `|=`       | $value must equal $pattern or start with $pattern followed by "-"
 * | `~=`       | $value is a whitespace-separated list containing $pattern
 *
 * Any other expression never matches.
 *
 * @param string $exp The expression.
 * @param string $pattern The pattern
 * @param string $value The value
 * @param string $case_sensitivity 'i' lowercases both $pattern and $value
 * before comparing; any other value leaves them untouched.
 * @return bool True if $value matches $pattern
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($case_sensitivity === 'i') {
        $pattern = strtolower($pattern);
        $value = strtolower($value);
    }

    switch ($exp) {
        case '=':
            return ($value === $pattern);
        case '!=':
            return ($value !== $pattern);
        // preg_match() yields 1, 0 or false; compare against 1 so that a
        // real boolean is returned, as documented.
        case '^=':
            return preg_match('/^' . preg_quote($pattern, '/') . '/', $value) === 1;
        case '$=':
            return preg_match('/' . preg_quote($pattern, '/') . '$/', $value) === 1;
        case '*=':
            return preg_match('/' . preg_quote($pattern, '/') . '/', $value) === 1;
        case '|=':
            /**
             * [att|=val]
             *
             * Represents an element with the att attribute, its value
             * either being exactly "val" or beginning with "val"
             * immediately followed by "-" (U+002D).
             *
             * A bare prefix test is not enough: "english" must not match
             * [lang|=en].
             */
            return $value === $pattern
                || strpos($value, $pattern . '-') === 0;
        case '~=':
            /**
             * [att~=val]
             *
             * Represents an element with the att attribute whose value is a
             * whitespace-separated list of words, one of which is exactly
             * "val". If "val" contains whitespace, it will never represent
             * anything (since the words are separated by spaces). Also if
             * "val" is the empty string, it will never represent anything.
             *
             * Splitting on runs of whitespace and dropping empty pieces
             * keeps an empty "val" from matching a doubled space.
             */
            $words = preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY);
            return in_array($pattern, $words, true);
    }
    return false;
}
1073
 
1074
/**
 * Break a CSS selector string into its structured parts.
 *
 * The input is trimmed, a single space is appended as a pseudo separator
 * (so the last compound selector is also followed by a separator), and the
 * result is scanned with one regular expression adapted from mootools
 * (https://mootools.net/). Per match the expression captures:
 *
 *   [1] tag name       ([\w:\*-]*)
 *   [2] id             (?:\#([\w-]+))?
 *   [3] class names    (?:|\.([\w\.-]+))?   e.g. "foo.bar.baz"
 *   [4] attribute list one or more "[...]" groups; an attribute name may be
 *                      preceded by "@" (not captured) and/or "!" (negation)
 *   [5] separator      ([\/, >+~]+)
 *
 * Note (Paperg): attribute names may contain a colon (attr:ibute). Such
 * attributes must be read with getAttribute(), because $node->x:y is not
 * valid PHP syntax.
 *
 * @param string $selector_string CSS selector string
 * @return array List of selectors (one per comma-separated part). Each
 * selector is a list of compound selectors, and each compound selector is
 *
 * ```php
 * array(
 *     [0], // (string) tag, lowercased when $this->dom->lowercase is truthy
 *     [1], // (string) id, '' when absent
 *     [2], // (array<string>|string) class names, '' when absent
 *     [3], // (array|string) attributes, '' when absent; each attribute is
 *          // array(name, expression, value, inverted, case-sensitivity)
 *     [4]  // (string) trimmed separator that followed the compound
 *          // selector: ' ' for pure whitespace, '' in front of a comma
 * )
 * ```
 *
 * @link https://www.w3.org/TR/selectors/#compound Compound selector
 */
protected function parse_selector($selector_string)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debug_log_entry(1);
    }

    // phpcs:ignore Generic.Files.LineLength
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

    // Same shape as the attribute part of $pattern, but with capturing
    // groups: [1] name, [2] expression, [3] value, [4] case sensitivity.
    $attribute_pattern = "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is";

    // The appended ' ' acts as the separator after the last selector.
    $subject = trim($selector_string) . ' ';
    preg_match_all($pattern, $subject, $matches, PREG_SET_ORDER);

    if (is_object($debug_object)) {
        $debug_object->debug_log(2, 'Matches Array: ', $matches);
    }

    $selectors = array();
    $current = array();

    foreach ($matches as $match) {
        // Matches that consist only of whitespace or slashes carry nothing.
        $full = trim($match[0]);
        if ($full === '' || $full === '/' || $full === '//') {
            continue;
        }

        $tag = $this->dom->lowercase ? strtolower($match[1]) : $match[1];
        $id = $match[2];

        // "foo.bar" => array('foo', 'bar'); an absent class part stays ''.
        $classes = ($match[3] !== '') ? explode('.', $match[3]) : $match[3];

        $attributes = $match[4];
        if ($attributes !== '') {
            preg_match_all(
                $attribute_pattern,
                trim($match[4]),
                $attribute_matches,
                PREG_SET_ORDER
            );

            $attributes = array();

            foreach ($attribute_matches as $attribute) {
                if (trim($attribute[0]) === '') {
                    continue;
                }

                // A leading "!" negates the attribute test.
                $negated = (isset($attribute[1][0]) && $attribute[1][0] === '!');
                $attribute_name = $negated ? substr($attribute[1], 1) : $attribute[1];

                $attributes[] = array(
                    $attribute_name,
                    isset($attribute[2]) ? $attribute[2] : '',
                    isset($attribute[3]) ? $attribute[3] : '',
                    $negated,
                    isset($attribute[4]) ? strtolower($attribute[4]) : '',
                );
            }
        }

        // Whitespace-only separators collapse to a single space (descendant
        // combinator); everything else is just trimmed.
        $separator = trim($match[5]);
        if ($separator === '' && $match[5] !== '') {
            $separator = ' ';
        }

        // A comma closes the current selector and is not stored itself.
        $ends_selector = ($separator === ',');
        if ($ends_selector) {
            $separator = '';
        }

        $current[] = array($tag, $id, $classes, $attributes, $separator);

        if ($ends_selector) {
            $selectors[] = $current;
            $current = array();
        }
    }

    if (count($current) > 0) {
        $selectors[] = $current;
    }

    return $selectors;
}
1237
 
1238
  function __get($name)
1239
  {
1240
+ if (isset($this->attr[$name])) {
 
1241
  return $this->convert_text($this->attr[$name]);
1242
  }
1243
+ switch ($name) {
 
1244
  case 'outertext': return $this->outertext();
1245
  case 'innertext': return $this->innertext();
1246
  case 'plaintext': return $this->text();
1252
  function __set($name, $value)
1253
  {
1254
  global $debug_object;
1255
+ if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1256
 
1257
+ switch ($name) {
 
1258
  case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
1259
  case 'innertext':
1260
+ if (isset($this->_[HDOM_INFO_TEXT])) {
1261
+ return $this->_[HDOM_INFO_TEXT] = $value;
1262
+ }
1263
  return $this->_[HDOM_INFO_INNER] = $value;
1264
  }
1265
+
1266
+ if (!isset($this->attr[$name])) {
1267
  $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
1268
  $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
1269
  }
1270
+
1271
  $this->attr[$name] = $value;
1272
  }
1273
 
1274
  function __isset($name)
1275
  {
1276
+ switch ($name) {
 
1277
  case 'outertext': return true;
1278
  case 'innertext': return true;
1279
  case 'plaintext': return true;
1282
  return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
1283
  }
1284
 
1285
/**
 * Magic unset: drops the attribute $name from $this->attr.
 *
 * Nothing happens when isset() reports the attribute as not set (which
 * includes attributes whose stored value is null).
 *
 * @param string $name attribute name
 */
function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
1289
 
1290
+ // PaperG - Function to convert the text from one character set to another
1291
+ // if the two sets are not the same.
1292
  function convert_text($text)
1293
  {
1294
  global $debug_object;
1295
+ if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1296
 
1297
  $converted_text = $text;
1298
 
1299
+ $sourceCharset = '';
1300
+ $targetCharset = '';
1301
 
1302
+ if ($this->dom) {
 
1303
  $sourceCharset = strtoupper($this->dom->_charset);
1304
  $targetCharset = strtoupper($this->dom->_target_charset);
1305
  }
 
1306
 
1307
+ if (is_object($debug_object)) {
1308
+ $debug_object->debug_log(3,
1309
+ 'source charset: '
1310
+ . $sourceCharset
1311
+ . ' target charaset: '
1312
+ . $targetCharset
1313
+ );
1314
+ }
1315
+
1316
+ if (!empty($sourceCharset)
1317
+ && !empty($targetCharset)
1318
+ && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
1319
  // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
1320
+ if ((strcasecmp($targetCharset, 'UTF-8') == 0)
1321
+ && ($this->is_utf8($text))) {
1322
  $converted_text = $text;
1323
+ } else {
 
 
1324
  $converted_text = iconv($sourceCharset, $targetCharset, $text);
1325
  }
1326
  }
1327
 
1328
  // Let's make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
1329
+ if ($targetCharset === 'UTF-8') {
1330
+ if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
 
 
1331
  $converted_text = substr($converted_text, 3);
1332
  }
1333
+
1334
+ if (substr($converted_text, -3) === "\xef\xbb\xbf") {
1335
  $converted_text = substr($converted_text, 0, -3);
1336
  }
1337
  }
1347
  */
1348
  static function is_utf8($str)
1349
  {
1350
+ $c = 0; $b = 0;
1351
+ $bits = 0;
1352
+ $len = strlen($str);
1353
+ for($i = 0; $i < $len; $i++) {
1354
+ $c = ord($str[$i]);
1355
+ if($c > 128) {
1356
+ if(($c >= 254)) { return false; }
1357
+ elseif($c >= 252) { $bits = 6; }
1358
+ elseif($c >= 248) { $bits = 5; }
1359
+ elseif($c >= 240) { $bits = 4; }
1360
+ elseif($c >= 224) { $bits = 3; }
1361
+ elseif($c >= 192) { $bits = 2; }
1362
+ else { return false; }
1363
+ if(($i + $bits) > $len) { return false; }
1364
+ while($bits > 1) {
 
 
 
1365
  $i++;
1366
+ $b = ord($str[$i]);
1367
+ if($b < 128 || $b > 191) { return false; }
1368
  $bits--;
1369
  }
1370
  }
1371
  }
1372
  return true;
1373
  }
 
 
 
 
 
 
 
1374
 
1375
  /**
1376
+ * Function to try a few tricks to determine the displayed size of an img on
1377
+ * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all
1378
+ * other tag types.
1379
  *
1380
  * @author John Schlick
1381
  * @version April 19 2012
1382
+ * @return array an array containing the 'height' and 'width' of the image
1383
+ * on the page or -1 if we can't figure it out.
1384
  */
1385
  function get_display_size()
1386
  {
1389
  $width = -1;
1390
  $height = -1;
1391
 
1392
+ if ($this->tag !== 'img') {
 
1393
  return false;
1394
  }
1395
 
1396
  // See if there is a height or width attribute in the tag itself.
1397
+ if (isset($this->attr['width'])) {
 
1398
  $width = $this->attr['width'];
1399
  }
1400
 
1401
+ if (isset($this->attr['height'])) {
 
1402
  $height = $this->attr['height'];
1403
  }
1404
 
1405
  // Now look for an inline style.
1406
+ if (isset($this->attr['style'])) {
 
1407
  // Thanks to user gnarf from stackoverflow for this regular expression.
1408
  $attributes = array();
1409
+
1410
+ preg_match_all(
1411
+ '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
1412
+ $this->attr['style'],
1413
+ $matches,
1414
+ PREG_SET_ORDER
1415
+ );
1416
+
1417
  foreach ($matches as $match) {
1418
+ $attributes[$match[1]] = $match[2];
1419
  }
1420
 
1421
  // If there is a width in the style attributes:
1422
+ if (isset($attributes['width']) && $width == -1) {
 
1423
  // check that the last two characters are px (pixels)
1424
+ if (strtolower(substr($attributes['width'], -2)) === 'px') {
 
1425
  $proposed_width = substr($attributes['width'], 0, -2);
1426
  // Now make sure that it's an integer and not something stupid.
1427
+ if (filter_var($proposed_width, FILTER_VALIDATE_INT)) {
 
1428
  $width = $proposed_width;
1429
  }
1430
  }
1431
  }
1432
 
1433
  // If there is a height in the style attributes:
1434
+ if (isset($attributes['height']) && $height == -1) {
 
1435
  // check that the last two characters are px (pixels)
1436
+ if (strtolower(substr($attributes['height'], -2)) == 'px') {
 
1437
  $proposed_height = substr($attributes['height'], 0, -2);
1438
  // Now make sure that it's an integer and not something stupid.
1439
+ if (filter_var($proposed_height, FILTER_VALIDATE_INT)) {
 
1440
  $height = $proposed_height;
1441
  }
1442
  }
1445
  }
1446
 
1447
  // Future enhancement:
1448
+ // Look in the tag to see if there is a class or id specified that has
1449
+ // a height or width attribute to it.
1450
 
1451
  // Far future enhancement
1452
+ // Look at all the parent tags of this image to see if they specify a
1453
+ // class or id that has an img selector that specifies a height or width
1454
+ // Note that in this case, the class or id will have the img subselector
1455
+ // for it to apply to the image.
1456
 
1457
  // ridiculously far future development
1458
+ // If the class or id is specified in a SEPARATE css file thats not on
1459
+ // the page, go get it and do what we were just doing for the ones on
1460
+ // the page.
1461
+
1462
+ $result = array(
1463
+ 'height' => $height,
1464
+ 'width' => $width
1465
+ );
1466
 
 
 
1467
  return $result;
1468
  }
1469
 
1470
  // camel naming conventions
1471
/**
 * camelCase accessor for the node's attribute map.
 *
 * @return array the content of $this->attr
 */
function getAllAttributes()
{
    $attributes = $this->attr;

    return $attributes;
}
1475
+
1476
/**
 * camelCase alias that reads $name through the magic getter __get().
 *
 * @param string $name attribute (or magic property) name
 * @return mixed whatever __get() returns for $name
 */
function getAttribute($name)
{
    $value = $this->__get($name);

    return $value;
}
1480
+
1481
/**
 * camelCase alias that writes $value through the magic setter __set().
 *
 * @param string $name  attribute (or magic property) name
 * @param mixed  $value value to store
 */
function setAttribute($name, $value)
{
    // The setter's result is intentionally not passed on.
    $this->__set($name, $value);
}
1485
+
1486
/**
 * camelCase alias for the magic __isset() check.
 *
 * @param string $name attribute (or magic property) name
 * @return bool result of __isset($name)
 */
function hasAttribute($name)
{
    $exists = $this->__isset($name);

    return $exists;
}
1490
+
1491
/**
 * camelCase helper that assigns null to $name via __set().
 *
 * The entry is overwritten with null, not unset from $this->attr.
 *
 * @param string $name attribute name
 */
function removeAttribute($name)
{
    $cleared = null;
    $this->__set($name, $cleared);
}
1495
+
1496
/**
 * Looks up the first match (index 0) for the id selector "#<id>" via find().
 *
 * @param string $id element id, without the leading "#"
 * @return mixed result of find() for index 0
 */
function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
1500
+
1501
/**
 * Runs find() with the id selector "#<id>".
 *
 * @param string   $id  element id, without the leading "#"
 * @param int|null $idx index forwarded to find(); null by default
 * @return mixed result of find()
 */
function getElementsById($id, $idx = null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
1505
+
1506
/**
 * Looks up the first match (index 0) for $name via find().
 *
 * @param string $name selector passed to find() unchanged
 * @return mixed result of find() for index 0
 */
function getElementByTagName($name)
{
    $first = $this->find($name, 0);

    return $first;
}
1510
+
1511
/**
 * Runs find() with $name as the selector.
 *
 * @param string   $name selector passed to find() unchanged
 * @param int|null $idx  index forwarded to find(); null by default
 * @return mixed result of find()
 */
function getElementsByTagName($name, $idx = null)
{
    $found = $this->find($name, $idx);

    return $found;
}
1515
+
1516
/**
 * camelCase alias for parent() called without arguments.
 *
 * @return mixed result of parent()
 */
function parentNode()
{
    $parent = $this->parent();

    return $parent;
}
1520
+
1521
/**
 * camelCase alias for children().
 *
 * @param int $idx index forwarded to children(); -1 by default
 * @return mixed result of children($idx)
 */
function childNodes($idx = -1)
{
    $children = $this->children($idx);

    return $children;
}
1525
+
1526
/**
 * camelCase alias for first_child().
 *
 * @return mixed result of first_child()
 */
function firstChild()
{
    $child = $this->first_child();

    return $child;
}
1530
+
1531
/**
 * camelCase alias for last_child().
 *
 * @return mixed result of last_child()
 */
function lastChild()
{
    $child = $this->last_child();

    return $child;
}
1535
+
1536
/**
 * camelCase alias for next_sibling().
 *
 * @return mixed result of next_sibling()
 */
function nextSibling()
{
    $sibling = $this->next_sibling();

    return $sibling;
}
1540
+
1541
/**
 * camelCase alias for prev_sibling().
 *
 * @return mixed result of prev_sibling()
 */
function previousSibling()
{
    $sibling = $this->prev_sibling();

    return $sibling;
}
1545
+
1546
/**
 * camelCase alias for has_child().
 *
 * @return mixed result of has_child()
 */
function hasChildNodes()
{
    $has_children = $this->has_child();

    return $has_children;
}
1550
+
1551
/**
 * camelCase accessor for the node's tag.
 *
 * @return mixed the content of $this->tag
 */
function nodeName()
{
    $tag = $this->tag;

    return $tag;
}
1555
+
1556
/**
 * Hands this node to $node->parent() and returns $node.
 *
 * @param object $node node that should receive this node as its parent
 * @return object the node that was passed in
 */
function appendChild($node)
{
    $child = $node;
    $child->parent($this);

    return $child;
}
1561
 
1562
  }
1563
 
1564
  /**
1565
  * simple html dom parser
1566
+ *
1567
+ * Paperg - in the find routine: allow us to specify that we want case
1568
+ * insensitive testing of the value of the selector.
1569
+ *
1570
  * Paperg - change $size from protected to public so we can easily access it
1571
+ *
1572
+ * Paperg - added ForceTagsClosed in the constructor which tells us whether we
1573
+ * trust the html or not. Default is to NOT trust it.
1574
  *
1575
  * @package PlaceLocalInclude
1576
  */
1646
  * Holds the current character at position {@see simple_html_dom::$pos} in
1647
  * the document {@see simple_html_dom::$doc}
1648
  *
1649
+ * _Note_: Using this variable is more efficient than calling
1650
+ * `substr($doc, $pos, 1)`
1651
  *
1652
  * @var string
1653
  */
1694
  */
1695
  protected $token_attr = ' >';
1696
 
1697
+ // Note that this is referenced by a child node, and so it needs to be
1698
+ // public for that node to see this information.
1699
  public $_charset = '';
1700
  public $_target_charset = '';
1701
 
1704
  *
1705
  * @var string
1706
  */
1707
+ protected $default_br_text = '';
1708
 
1709
  /**
1710
  * Suffix for <span> elements
1711
  *
1712
  * @var string
1713
  */
1714
+ public $default_span_text = '';
1715
 
1716
  /**
1717
  * Defines a list of self-closing tags (Void elements) according to the HTML
1726
  * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
1727
  */
1728
  protected $self_closing_tags = array(
1729
+ 'area' => 1,
1730
+ 'base' => 1,
1731
+ 'br' => 1,
1732
+ 'col' => 1,
1733
+ 'embed' => 1,
1734
+ 'hr' => 1,
1735
+ 'img' => 1,
1736
+ 'input' => 1,
1737
+ 'link' => 1,
1738
+ 'meta' => 1,
1739
+ 'param' => 1,
1740
+ 'source' => 1,
1741
+ 'track' => 1,
1742
+ 'wbr' => 1
1743
  );
1744
 
1745
  /**
1754
  * - Sort elements by name for better readability!
1755
  */
1756
  protected $block_tags = array(
1757
+ 'body' => 1,
1758
+ 'div' => 1,
1759
+ 'form' => 1,
1760
+ 'root' => 1,
1761
+ 'span' => 1,
1762
+ 'table' => 1
1763
  );
1764
 
1765
  /**
1818
  * the document.
1819
  */
1820
  protected $optional_closing_tags = array(
1821
+ // Not optional, see
1822
+ // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1823
+ 'b' => array('b' => 1),
1824
+ 'dd' => array('dd' => 1, 'dt' => 1),
1825
+ // Not optional, see
1826
+ // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1827
+ 'dl' => array('dd' => 1, 'dt' => 1),
1828
+ 'dt' => array('dd' => 1, 'dt' => 1),
1829
+ 'li' => array('li' => 1),
1830
+ 'optgroup' => array('optgroup' => 1, 'option' => 1),
1831
+ 'option' => array('optgroup' => 1, 'option' => 1),
1832
+ 'p' => array('p' => 1),
1833
+ 'rp' => array('rp' => 1, 'rt' => 1),
1834
+ 'rt' => array('rp' => 1, 'rt' => 1),
1835
+ 'td' => array('td' => 1, 'th' => 1),
1836
+ 'th' => array('td' => 1, 'th' => 1),
1837
+ 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
1838
  );
1839
 
1840
/**
 * Create the DOM object and optionally load a document right away.
 *
 * Fixes compared to the previous body:
 *  - With $forceTagsClosed = false the code assigned an empty array to
 *    "optional_closing_array", a property that does not exist. The list
 *    the parser actually consults is $optional_closing_tags, so the flag
 *    had no effect. The correct property is cleared now.
 *  - The flag is applied before the document is loaded, so it also
 *    affects a document handed directly to the constructor.
 *  - "https://" sources are recognised as URLs, like "http://".
 *
 * @param string|null $str             HTML text, a file path or an
 *                                     http(s) URL; nothing is loaded when
 *                                     this is empty
 * @param bool        $lowercase       forwarded to load()
 * @param bool        $forceTagsClosed false means the HTML is trusted and
 *                                     the optional-closing-tag table is
 *                                     emptied
 * @param string      $target_charset  stored in $this->_target_charset
 * @param bool        $stripRN         forwarded to load()
 * @param string      $defaultBRText   forwarded to load()
 * @param string      $defaultSpanText forwarded to load()
 * @param int         $options         forwarded to load()
 */
function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    // This has to happen before anything is parsed.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str) {
        if (preg_match('/^https?:\/\//i', $str) || is_file($str)) {
            // URL or existing file: load_file() reads the content itself.
            $this->load_file($str);
        } else {
            $this->load(
                $str,
                $lowercase,
                $stripRN,
                $defaultBRText,
                $defaultSpanText,
                $options
            );
        }
    }

    $this->_target_charset = $target_charset;
}
1872
 
1876
  }
1877
 
1878
  // load html from string
1879
+ function load(
1880
+ $str,
1881
+ $lowercase = true,
1882
+ $stripRN = true,
1883
+ $defaultBRText = DEFAULT_BR_TEXT,
1884
+ $defaultSpanText = DEFAULT_SPAN_TEXT,
1885
+ $options = 0)
1886
  {
1887
  global $debug_object;
1888
 
1897
 
1898
  // strip out the \r \n's if we are told to.
1899
  if ($stripRN) {
1900
+ $this->doc = str_replace("\r", ' ', $this->doc);
1901
+ $this->doc = str_replace("\n", ' ', $this->doc);
1902
 
1903
  // set the length of content since we have changed it.
1904
  $this->size = strlen($this->doc);
1928
 
1929
  // make load function chainable
1930
  return $this;
 
1931
  }
1932
 
1933
  // load html from file
1935
  {
1936
  $args = func_get_args();
1937
 
1938
+ if(($doc = call_user_func_array('file_get_contents', $args)) !== false) {
1939
  $this->load($doc, true);
1940
  } else {
1941
  return false;
1965
  }
1966
 
1967
  // save dom as string
1968
/**
 * Serialize the DOM to a string and optionally write it to a file.
 *
 * @param string $filepath target file; '' (default) skips writing
 * @return string the root node's innertext()
 */
function save($filepath = '')
{
    $html = $this->root->innertext();

    $write_to_file = ($filepath !== '');
    if ($write_to_file) {
        // Written under an exclusive lock; the result is not checked.
        file_put_contents($filepath, $html, LOCK_EX);
    }

    return $html;
}
1974
 
1975
  // find dom node by css selector
1976
  // Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
1977
/**
 * Find nodes by CSS selector, starting at the root node.
 *
 * @param string   $selector  CSS selector
 * @param int|null $idx       index forwarded to the root node's find()
 * @param bool     $lowercase forwarded to the root node's find()
 *                            (Paperg: case-insensitive value testing)
 * @return mixed result of $this->root->find()
 */
function find($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;

    return $root->find($selector, $idx, $lowercase);
}
1982
  // clean up memory due to php5 circular references memory leak...
1983
  function clear()
1984
  {
1985
+ foreach ($this->nodes as $n) {
1986
+ $n->clear(); $n = null;
1987
+ }
1988
+
1989
+ // This add next line is documented in the sourceforge repository.
1990
+ // 2977248 as a fix for ongoing memory leaks that occur even with the
1991
+ // use of clear.
1992
+ if (isset($this->children)) {
1993
+ foreach ($this->children as $n) {
1994
+ $n->clear(); $n = null;
1995
+ }
1996
+ }
1997
+
1998
+ if (isset($this->parent)) {
1999
+ $this->parent->clear();
2000
+ unset($this->parent);
2001
+ }
2002
+
2003
+ if (isset($this->root)) {
2004
+ $this->root->clear();
2005
+ unset($this->root);
2006
+ }
2007
+
2008
  unset($this->doc);
2009
  unset($this->noise);
2010
  }
2011
 
2012
/**
 * Forward a dump request to the root node.
 *
 * @param bool $show_attr passed unchanged to the root node's dump()
 */
function dump($show_attr = true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
2016
 
2017
  // prepare HTML data and init everything
2018
+ protected function prepare(
2019
+ $str, $lowercase = true,
2020
+ $defaultBRText = DEFAULT_BR_TEXT,
2021
+ $defaultSpanText = DEFAULT_SPAN_TEXT)
2022
  {
2023
  $this->clear();
2024
 
2025
  $this->doc = trim($str);
2026
  $this->size = strlen($this->doc);
2027
+ $this->original_size = $this->size; // original size of the html
2028
  $this->pos = 0;
2029
  $this->cursor = 1;
2030
  $this->noise = array();
2037
  $this->root->_[HDOM_INFO_BEGIN] = -1;
2038
  $this->root->nodetype = HDOM_TYPE_ROOT;
2039
  $this->parent = $this->root;
2040
+ if ($this->size > 0) { $this->char = $this->doc[0]; }
2041
  }
2042
 
2043
  /**
2050
  while (true) {
2051
  // Read next tag if there is no text between current position and the
2052
  // next opening tag.
2053
+ if (($s = $this->copy_until_char('<')) === '') {
 
2054
  if($this->read_tag()) {
2055
  continue;
2056
  } else {
2066
  }
2067
  }
2068
 
2069
/**
 * Work out the character set of the parsed document and store it in
 * $this->_charset (PAPERG - dkchou).
 *
 * Sources are tried in this order; the first one that yields a value wins:
 *  1. If a function get_last_retrieve_url_contents_content_type() exists,
 *     its return value is treated as a Content-Type header and searched
 *     for "charset=".
 *  2. The content of the first <meta http-equiv="Content-Type"> element.
 *     If it has a content value without "charset=", ISO-8859-1 is assumed.
 *  3. mb_detect_encoding() with the candidates UTF-8 and CP1252, when the
 *     function is available.
 *  4. UTF-8 as the last resort.
 *
 * ISO-8859-1 / Latin1 / Latin-1 are finally replaced by CP1252.
 *
 * Changes compared to the previous body: the header is now searched
 * case-insensitively (as the meta tag already was), and a stray local
 * variable assignment inside the mb_detect_encoding() call was removed.
 *
 * @return string the detected character set
 */
protected function parse_charset()
{
    global $debug_object;

    $charset = null;

    // 1. Content-Type header provided by the optional user-defined hook.
    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();

        // Media type parameter names are case-insensitive, so "Charset="
        // must be found as well.
        if (preg_match('/charset=(.+)/i', $contentTypeHeader, $matches)) {
            $charset = $matches[1];

            if (is_object($debug_object)) {
                $debug_object->debug_log(2,
                    'header content-type found charset of: '
                    . $charset
                );
            }
        }
    }

    // 2. <meta http-equiv="Content-Type" content="...">
    if (empty($charset)) {
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

        if (!empty($el)) {
            $fullvalue = $el->content;

            if (is_object($debug_object)) {
                $debug_object->debug_log(2,
                    'meta content-type tag found'
                    . $fullvalue
                );
            }

            if (!empty($fullvalue)) {
                if (preg_match('/charset=(.+)/i', $fullvalue, $matches)) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the
                    // character set, research says that it's typically
                    // ISO-8859-1
                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2,
                            'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
                        );
                    }

                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // 3. + 4. Nothing declared: try to detect the encoding from the text.
    if (empty($charset)) {
        // Stays false when mb_detect_encoding isn't installed/loaded on
        // this machine or when detection fails.
        $charset = false;

        if (function_exists('mb_detect_encoding')) {
            $charset = mb_detect_encoding(
                $this->doc . 'ascii',
                array('UTF-8', 'CP1252')
            );

            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'mb_detect found: ' . $charset);
            }
        }

        // and if this doesn't work... then we need to just wrongheadedly
        // assume it's UTF-8 so that we can move on - cause this will
        // usually give us most of what we need...
        if ($charset === false) {
            if (is_object($debug_object)) {
                $debug_object->debug_log(
                    2,
                    'since mb_detect failed - using default of utf-8'
                );
            }

            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want
    // it instead.
    $latin1_aliases = array('iso-8859-1', 'latin1', 'latin-1');
    if (in_array(strtolower($charset), $latin1_aliases, true)) {
        if (is_object($debug_object)) {
            $debug_object->debug_log(
                2,
                'replacing ' . $charset . ' with CP1252 as its a superset'
            );
        }

        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {
        $debug_object->debug_log(1, 'EXIT - ' . $charset);
    }

    return $this->_charset = $charset;
}
  }
2194
  protected function read_tag()
2195
  {
2196
  // Set end position if no further tags found
2197
+ if ($this->char !== '<') {
 
2198
  $this->root->_[HDOM_INFO_END] = $this->cursor;
2199
  return false;
2200
  }
2201
+
2202
  $begin_tag_pos = $this->pos;
2203
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2204
 
2205
  // end tag
2206
+ if ($this->char === '/') {
2207
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
 
2208
 
2209
  // Skip whitespace in end tags (i.e. in "</ html>")
2210
  $this->skip($this->token_blank);
2211
  $tag = $this->copy_until_char('>');
2212
 
2213
  // Skip attributes in end tags
2214
+ if (($pos = strpos($tag, ' ')) !== false) {
2215
  $tag = substr($tag, 0, $pos);
2216
+ }
2217
 
2218
  $parent_lower = strtolower($this->parent->tag);
2219
  $tag_lower = strtolower($tag);
2220
 
2221
  // The end tag is supposed to close the parent tag. Handle situations
2222
  // when it doesn't
2223
+ if ($parent_lower !== $tag_lower) {
 
2224
  // Parent tag does not have to be closed necessarily (optional closing tag)
2225
  // Current tag is a block tag, so it may close an ancestor
2226
+ if (isset($this->optional_closing_tags[$parent_lower])
2227
+ && isset($this->block_tags[$tag_lower])) {
2228
+
2229
  $this->parent->_[HDOM_INFO_END] = 0;
2230
  $org_parent = $this->parent;
2231
 
2232
  // Traverse ancestors to find a matching opening tag
2233
  // Stop at root node
2234
+ while (($this->parent->parent)
2235
+ && strtolower($this->parent->tag) !== $tag_lower
2236
+ ){
2237
  $this->parent = $this->parent->parent;
2238
+ }
2239
 
2240
  // If we don't have a match add current tag as text node
2241
+ if (strtolower($this->parent->tag) !== $tag_lower) {
2242
  $this->parent = $org_parent; // restore origonal parent
2243
+
2244
+ if ($this->parent->parent) {
2245
+ $this->parent = $this->parent->parent;
2246
+ }
2247
+
2248
  $this->parent->_[HDOM_INFO_END] = $this->cursor;
2249
  return $this->as_text_node($tag);
2250
  }
2251
+ } elseif (($this->parent->parent)
2252
+ && isset($this->block_tags[$tag_lower])
2253
+ ) {
2254
+ // Grandparent exists and current tag is a block tag, so our
2255
+ // parent doesn't have an end tag
2256
  $this->parent->_[HDOM_INFO_END] = 0; // No end tag
2257
  $org_parent = $this->parent;
2258
 
2259
  // Traverse ancestors to find a matching opening tag
2260
  // Stop at root node
2261
+ while (($this->parent->parent)
2262
+ && strtolower($this->parent->tag) !== $tag_lower
2263
+ ) {
2264
  $this->parent = $this->parent->parent;
2265
+ }
2266
 
2267
  // If we don't have a match add current tag as text node
2268
+ if (strtolower($this->parent->tag) !== $tag_lower) {
 
2269
  $this->parent = $org_parent; // restore origonal parent
2270
  $this->parent->_[HDOM_INFO_END] = $this->cursor;
2271
  return $this->as_text_node($tag);
2272
  }
2273
+ } elseif (($this->parent->parent)
2274
+ && strtolower($this->parent->parent->tag) === $tag_lower
2275
+ ) { // Grandparent exists and current tag closes it
 
2276
  $this->parent->_[HDOM_INFO_END] = 0;
2277
  $this->parent = $this->parent->parent;
2278
+ } else { // Random tag, add as text node
 
2279
  return $this->as_text_node($tag);
2280
+ }
2281
  }
2282
 
2283
  // Set end position of parent tag to current cursor position
2284
  $this->parent->_[HDOM_INFO_END] = $this->cursor;
 
2285
 
2286
+ if ($this->parent->parent) {
2287
+ $this->parent = $this->parent->parent;
2288
+ }
2289
+
2290
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2291
  return true;
2292
  }
2293
 
2302
  // <!DOCTYPE html>
2303
  // <![CDATA[ ... ]]>
2304
  // <!-- Comment -->
2305
+ if (isset($tag[0]) && $tag[0] === '!') {
2306
  $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
2307
 
2308
+ if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
2309
  $node->nodetype = HDOM_TYPE_COMMENT;
2310
  $node->tag = 'comment';
2311
  } else { // Could be doctype or CDATA but we don't care
2312
  $node->nodetype = HDOM_TYPE_UNKNOWN;
2313
  $node->tag = 'unknown';
2314
  }
2315
+
2316
+ if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
2317
+
2318
  $this->link_nodes($node, true);
2319
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2320
  return true;
2321
  }
2322
 
2323
  // The start tag cannot contain another start tag, if so add as text
2324
  // i.e. "<<html>"
2325
+ if ($pos = strpos($tag, '<') !== false) {
2326
  $tag = '<' . substr($tag, 0, -1);
2327
  $node->_[HDOM_INFO_TEXT] = $tag;
2328
  $this->link_nodes($node, false);
2331
  }
2332
 
2333
  // Handle invalid tag names (i.e. "<html#doc>")
2334
+ if (!preg_match('/^\w[\w:-]*$/', $tag)) {
2335
  $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
2336
 
2337
  // Next char is the beginning of a new tag, don't touch it.
2338
+ if ($this->char === '<') {
2339
  $this->link_nodes($node, false);
2340
  return true;
2341
  }
2342
 
2343
  // Next char closes current tag, add and be done with it.
2344
+ if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
2345
  $this->link_nodes($node, false);
2346
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2347
  return true;
2348
  }
2349
 
2353
  $node->tag = ($this->lowercase) ? $tag_lower : $tag;
2354
 
2355
  // handle optional closing tags
2356
+ if (isset($this->optional_closing_tags[$tag_lower])) {
 
2357
  // Traverse ancestors to close all optional closing tags
2358
+ while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
 
2359
  $this->parent->_[HDOM_INFO_END] = 0;
2360
  $this->parent = $this->parent->parent;
2361
  }
2363
  }
2364
 
2365
  $guard = 0; // prevent infinity loop
2366
+
2367
+ // [0] Space between tag and first attribute
2368
+ $space = array($this->copy_skip($this->token_blank), '', '');
2369
 
2370
  // attributes
2371
+ do {
 
2372
  // Everything until the first equal sign should be the attribute name
2373
  $name = $this->copy_until($this->token_equal);
2374
 
2375
+ if ($name === '' && $this->char !== null && $space[0] === '') {
 
2376
  break;
2377
  }
2378
 
2379
+ if ($guard === $this->pos) { // Escape infinite loop
2380
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
 
2381
  continue;
2382
  }
2383
+
2384
  $guard = $this->pos;
2385
 
2386
  // handle endless '<'
2387
+ // Out of bounds before the tag ended
2388
+ if ($this->pos >= $this->size - 1 && $this->char !== '>') {
2389
  $node->nodetype = HDOM_TYPE_TEXT;
2390
  $node->_[HDOM_INFO_END] = 0;
2391
+ $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
2392
  $node->tag = 'text';
2393
  $this->link_nodes($node, false);
2394
  return true;
2395
  }
2396
 
2397
  // handle mismatch '<'
2398
+ // Attributes cannot start after opening tag
2399
+ if ($this->doc[$this->pos - 1] == '<') {
2400
  $node->nodetype = HDOM_TYPE_TEXT;
2401
  $node->tag = 'text';
2402
  $node->attr = array();
2403
  $node->_[HDOM_INFO_END] = 0;
2404
+ $node->_[HDOM_INFO_TEXT] = substr(
2405
+ $this->doc,
2406
+ $begin_tag_pos,
2407
+ $this->pos - $begin_tag_pos - 1
2408
+ );
2409
  $this->pos -= 2;
2410
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2411
  $this->link_nodes($node, false);
2412
  return true;
2413
  }
2414
 
2415
+ if ($name !== '/' && $name !== '') { // this is a attribute name
2416
+ // [1] Whitespace after attribute name
2417
+ $space[1] = $this->copy_skip($this->token_blank);
2418
+
2419
  $name = $this->restore_noise($name); // might be a noisy name
2420
+
2421
+ if ($this->lowercase) { $name = strtolower($name); }
2422
+
2423
+ if ($this->char === '=') { // attribute with value
2424
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2425
  $this->parse_attr($node, $name, $space); // get attribute value
2426
+ } else {
 
2427
  //no value attr: nowrap, checked selected...
2428
  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2429
  $node->attr[$name] = true;
2430
+ if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
2431
  }
2432
+
2433
  $node->_[HDOM_INFO_SPACE][] = $space;
2434
+
2435
+ // prepare for next attribute
2436
+ $space = array(
2437
+ $this->copy_skip($this->token_blank),
2438
+ '',
2439
+ ''
2440
+ );
2441
+ } else { // no more attributes
2442
  break;
2443
+ }
2444
+ } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
2445
 
2446
  $this->link_nodes($node, true);
2447
  $node->_[HDOM_INFO_ENDSPACE] = $space[0];
2448
 
2449
  // handle empty tags (i.e. "<div/>")
2450
+ if ($this->copy_until_char('>') === '/') {
 
2451
  $node->_[HDOM_INFO_ENDSPACE] .= '/';
2452
  $node->_[HDOM_INFO_END] = 0;
2453
+ } else {
 
 
2454
  // reset parent
2455
+ if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
2456
+ $this->parent = $node;
2457
+ }
2458
  }
2459
+
2460
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2461
 
2462
  // If it's a BR tag, we need to set its text to the default text.
2463
  // This way when we see it in plaintext, we can generate formatting that the user wants.
2464
  // since a br tag never has sub nodes, this works well.
2465
+ if ($node->tag === 'br') {
 
2466
  $node->_[HDOM_INFO_INNER] = $this->default_br_text;
2467
  }
2468
 
2480
  protected function parse_attr($node, $name, &$space)
2481
  {
2482
  // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
2483
+ // If the attribute is already defined inside a tag, only pay attention
2484
+ // to the first one as opposed to the last one.
2485
  // https://stackoverflow.com/a/26341866
2486
+ if (isset($node->attr[$name])) {
 
2487
  return;
2488
  }
2489
 
2490
+ // [2] Whitespace between "=" and the value
2491
+ $space[2] = $this->copy_skip($this->token_blank);
2492
+
2493
  switch ($this->char) {
2494
  case '"': // value is anything between double quotes
2495
  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
2496
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2497
  $node->attr[$name] = $this->restore_noise($this->copy_until_char('"'));
2498
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2499
  break;
2500
  case '\'': // value is anything between single quotes
2501
  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
2502
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2503
  $node->attr[$name] = $this->restore_noise($this->copy_until_char('\''));
2504
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2505
  break;
2506
  default: // value is anything until the first space or end tag
2507
  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2508
  $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
2509
  }
2510
+ // PaperG: Attributes should not have \r or \n in them, that counts as
2511
+ // html whitespace.
2512
+ $node->attr[$name] = str_replace("\r", '', $node->attr[$name]);
2513
+ $node->attr[$name] = str_replace("\n", '', $node->attr[$name]);
2514
+ // PaperG: If this is a "class" selector, lets get rid of the preceeding
2515
+ // and trailing space since some people leave it in the multi class case.
2516
+ if ($name === 'class') {
2517
  $node->attr[$name] = trim($node->attr[$name]);
2518
  }
2519
  }
2530
  {
2531
  $node->parent = $this->parent;
2532
  $this->parent->nodes[] = $node;
2533
+ if ($is_child) {
 
2534
  $this->parent->children[] = $node;
2535
  }
2536
  }
2547
  ++$this->cursor;
2548
  $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
2549
  $this->link_nodes($node, false);
2550
+ $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2551
  return true;
2552
  }
2553
 
2562
  protected function skip($chars)
2563
  {
2564
  $this->pos += strspn($this->doc, $chars, $this->pos);
2565
+ $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2566
  }
2567
 
2568
  /**
2578
  $pos = $this->pos;
2579
  $len = strspn($this->doc, $chars, $pos);
2580
  $this->pos += $len;
2581
+ $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2582
+ if ($len === 0) { return ''; }
2583
  return substr($this->doc, $pos, $len);
2584
  }
2585
 
2596
  $pos = $this->pos;
2597
  $len = strcspn($this->doc, $chars, $pos);
2598
  $this->pos += $len;
2599
+ $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2600
  return substr($this->doc, $pos, $len);
2601
  }
2602
 
2610
  */
2611
  protected function copy_until_char($char)
2612
  {
2613
+ if ($this->char === null) { return ''; }
2614
 
2615
+ if (($pos = strpos($this->doc, $char, $this->pos)) === false) {
2616
+ $ret = substr($this->doc, $this->pos, $this->size - $this->pos);
2617
  $this->char = null;
2618
  $this->pos = $this->size;
2619
  return $ret;
2620
  }
2621
 
2622
+ if ($pos === $this->pos) { return ''; }
2623
+
2624
  $pos_old = $this->pos;
2625
  $this->char = $this->doc[$pos];
2626
  $this->pos = $pos;
2627
+ return substr($this->doc, $pos_old, $pos - $pos_old);
2628
  }
2629
 
2630
  /**
2636
  * @param bool $remove_tag True to remove the entire match. Default is false
2637
  * to only remove the captured data.
2638
  */
2639
+ protected function remove_noise($pattern, $remove_tag = false)
2640
  {
2641
  global $debug_object;
2642
  if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2643
 
2644
+ $count = preg_match_all(
2645
+ $pattern,
2646
+ $this->doc,
2647
+ $matches,
2648
+ PREG_SET_ORDER | PREG_OFFSET_CAPTURE
2649
+ );
2650
+
2651
+ for ($i = $count - 1; $i > -1; --$i) {
2652
+ $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);
2653
+
2654
+ if (is_object($debug_object)) {
2655
+ $debug_object->debug_log(2, 'key is: ' . $key);
2656
+ }
2657
 
 
 
 
 
2658
  $idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch
2659
  $this->noise[$key] = $matches[$i][$idx][0];
2660
  $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
2662
 
2663
  // reset the length of content
2664
  $this->size = strlen($this->doc);
2665
+
2666
+ if ($this->size > 0) {
2667
  $this->char = $this->doc[0];
2668
  }
2669
  }
2681
  global $debug_object;
2682
  if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2683
 
2684
+ while (($pos = strpos($text, '___noise___')) !== false) {
2685
+ // Sometimes there is a broken piece of markup, and we don't GET the
2686
+ // pos+11 etc... token which indicates a problem outside of us...
2687
+
2688
+ // todo: "___noise___1000" (or any number with four or more digits)
2689
+ // in the DOM causes an infinite loop which could be utilized by
2690
+ // malicious software
2691
+ if (strlen($text) > $pos + 15) {
2692
+ $key = '___noise___'
2693
+ . $text[$pos + 11]
2694
+ . $text[$pos + 12]
2695
+ . $text[$pos + 13]
2696
+ . $text[$pos + 14]
2697
+ . $text[$pos + 15];
2698
+
2699
+ if (is_object($debug_object)) {
2700
+ $debug_object->debug_log(2, 'located key of: ' . $key);
2701
  }
2702
+
2703
+ if (isset($this->noise[$key])) {
2704
+ $text = substr($text, 0, $pos)
2705
+ . $this->noise[$key]
2706
+ . substr($text, $pos + 16);
2707
+ } else {
2708
  // do this to prevent an infinite loop.
2709
+ $text = substr($text, 0, $pos)
2710
+ . 'UNDEFINED NOISE FOR KEY: '
2711
+ . $key
2712
+ . substr($text, $pos + 16);
2713
  }
2714
+ } else {
2715
+ // There is no valid key being given back to us... We must get
2716
+ // rid of the ___noise___ or we will have a problem.
2717
+ $text = substr($text, 0, $pos)
2718
+ . 'NO NUMERIC NOISE KEY'
2719
+ . substr($text, $pos + 11);
2720
  }
2721
  }
2722
  return $text;
2728
  global $debug_object;
2729
  if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2730
 
2731
+ foreach($this->noise as $noiseElement) {
2732
+ if (strpos($noiseElement, $text) !== false) {
 
 
2733
  return $noiseElement;
2734
  }
2735
  }
2736
  }
2737
+
2738
  function __toString()
2739
  {
2740
  return $this->root->innertext();
2742
 
2743
  function __get($name)
2744
  {
2745
+ switch ($name) {
 
2746
  case 'outertext':
2747
  return $this->root->innertext();
2748
  case 'innertext':
2757
  }
2758
 
2759
  // camel naming conventions
2760
+ function childNodes($idx = -1)
2761
+ {
2762
+ return $this->root->childNodes($idx);
2763
+ }
2764
+
2765
+ function firstChild()
2766
+ {
2767
+ return $this->root->first_child();
2768
+ }
2769
+
2770
+ function lastChild()
2771
+ {
2772
+ return $this->root->last_child();
2773
+ }
2774
+
2775
+ function createElement($name, $value = null)
2776
+ {
2777
+ return @str_get_html("<$name>$value</$name>")->first_child();
2778
+ }
2779
+
2780
+ function createTextNode($value)
2781
+ {
2782
+ return @end(str_get_html($value)->nodes);
2783
+ }
2784
+
2785
+ function getElementById($id)
2786
+ {
2787
+ return $this->find("#$id", 0);
2788
+ }
2789
+
2790
+ function getElementsById($id, $idx = null)
2791
+ {
2792
+ return $this->find("#$id", $idx);
2793
+ }
2794
 
2795
+ function getElementByTagName($name)
2796
+ {
2797
+ return $this->find($name, 0);
2798
+ }
2799
+
2800
+ function getElementsByTagName($name, $idx = -1)
2801
+ {
2802
+ return $this->find($name, $idx);
2803
+ }
2804
+
2805
+ function loadFile()
2806
+ {
2807
+ $args = func_get_args();
2808
+ $this->load_file($args);
2809
+ }
2810
+ }