External Links - Version 5.0

Version Description

  • Completely replaced the mechanism for parsing links to resolve the various errors that have been occurring with different external services' link attributes
  • Added new option to apply to text widgets in addition to content and comment links. This will now handle 95% of the external link cases and allow for Apply Globally to be left off.
  • Tested with WP 3.7
Download this release

Release Info

Developer Mike_Koepke
Plugin Icon wp plugin External Links
Version 5.0
Comparing to
See all releases

Code changes from version 5.1 to 5.0

anchor-utils/anchor-utils.php CHANGED
@@ -2,7 +2,7 @@
2
  /*
3
  * Anchor Utils
4
  * Author: Denis de Bernardy & Mike Koepke <http://www.semiologic.com>
5
- * Version: 1.3
6
  */
7
 
8
  if ( @ini_get('pcre.backtrack_limit') <= 750000 )
@@ -17,11 +17,10 @@ if ( @ini_get('pcre.recursion_limit') <= 250000 )
17
  **/
18
 
19
  class anchor_utils {
20
-
21
- /**
22
  * anchor_utils
23
  */
24
- public function __construct() {
25
  add_filter('the_content', array($this, 'filter'), 100);
26
  add_filter('the_excerpt', array($this, 'filter'), 100);
27
  add_filter('widget_text', array($this, 'filter'), 100);
@@ -30,7 +29,6 @@ class anchor_utils {
30
  add_action('wp_head', array($this, 'ob_start'), 10000);
31
  } #anchor_utils
32
 
33
-
34
  /**
35
  * ob_start()
36
  *
@@ -50,6 +48,7 @@ class anchor_utils {
50
  }
51
  } # ob_start()
52
 
 
53
  /**
54
  * ob_filter()
55
  *
@@ -184,8 +183,13 @@ class anchor_utils {
184
  **/
185
 
186
  function parse_anchor($match) {
 
 
 
 
 
187
  $anchor = array();
188
- $anchor['attr'] = anchor_utils::parse_attrs($match[1]);
189
 
190
  if ( !is_array($anchor['attr']) || empty($anchor['attr']['href']) # parser error or no link
191
  || trim($anchor['attr']['href']) != esc_url($anchor['attr']['href'], null, 'db') ) # likely a script
@@ -302,90 +306,6 @@ class anchor_utils {
302
 
303
  return str_replace(array_keys($unescape), array_values($unescape), $text);
304
  } # unescape()
305
-
306
/**
 * Parse an attributes string into a name => value array.
 *
 * If the string starts with a tag, the tag name is skipped and only the
 * attributes of that first tag are parsed. Parsing is a manual character
 * loop (no DOMDocument involved).
 *
 * Rules applied by the loop:
 *  - values may be double-quoted, single-quoted or unquoted;
 *  - an unquoted value ends at whitespace or at the closing '>' of the tag;
 *  - a name without a value is stored with an empty string;
 *  - the '/' of a self-closing tag is never treated as an attribute;
 *  - parsing stops at the first '>' found outside a quoted value.
 *
 * @param string|* $attrs attribute string or tag; non-scalars are cast to array and returned
 * @return array attribute name => value
 *
 * @example parse_attrs( 'src="example.jpg" alt="example"' )
 * @example parse_attrs( '<img src="example.jpg" alt="example" />' )
 * @example parse_attrs( '<a href="example"></a>' )
 * @example parse_attrs( '<a href=example>' )
 */
function parse_attrs($attrs) {

	if ( !is_scalar($attrs) )
		return (array) $attrs;

	$attrs = trim((string) $attrs);

	# nothing to parse; also avoids indexing an empty array below
	if ( '' === $attrs )
		return array();

	$attrs = str_split($attrs);

	if ( '<' === $attrs[0] ) # looks like a tag so strip the tagname
		while ( $attrs && !ctype_space($attrs[0]) && $attrs[0] !== '>' )
			array_shift($attrs);

	$arr = array(); # output
	$name = ''; # for the current attr being parsed
	$value = ''; # for the current attr being parsed
	$mode = 0; # whether current char is part of the name (-), the value (+), or neither (0)
	$stop = false; # delimiter for the current $value being parsed
	$space = ' '; # marker meaning "unquoted value"

	foreach ( $attrs as $j => $curr ) {

		if ( $mode < 0 ) { # name
			if ( '=' === $curr ) {
				$mode = 1;
				$stop = false;
			} elseif ( '>' === $curr ) {
				# end of tag: the pending name is stored by the final flush
				break;
			} elseif ( '/' === $curr ) {
				# self-closing slash ends a valueless name
				'' === $name or $arr[ $name ] = '';
				$name = '';
				$mode = 0;
			} elseif ( !ctype_space($curr) ) {
				if ( ctype_space( $attrs[ $j - 1 ] ) ) { # previous char
					'' === $name or $arr[ $name ] = ''; # previous name had no value
					$name = $curr; # initiate new
				} else {
					$name .= $curr;
				}
			}
		} elseif ( $mode > 0 ) { # value
			if ( $stop === false ) {
				# still looking for the first char of the value
				if ( '>' === $curr )
					break; # "name=>": empty value, stored by the final flush
				if ( !ctype_space($curr) ) {
					if ( '"' === $curr || "'" === $curr ) {
						$value = '';
						$stop = $curr;
					} else {
						$value = $curr;
						$stop = $space;
					}
				}
			} elseif ( $stop === $space ) { # unquoted value
				if ( ctype_space($curr) || '>' === $curr ) {
					$arr[ $name ] = $value;
					$mode = 0;
					$name = $value = '';
					if ( '>' === $curr )
						break;
				} else {
					$value .= $curr;
				}
			} elseif ( $curr === $stop ) { # closing quote
				$arr[ $name ] = $value;
				$mode = 0;
				$name = $value = '';
			} else {
				$value .= $curr;
			}
		} else { # neither

			if ( '>' === $curr )
				break;
			if ( '/' !== $curr && !ctype_space( $curr ) ) {
				# initiate
				$name = $curr;
				$mode = -1;
			}
		}
	}

	# incl the final pair if it was quoteless or valueless
	'' === $name or $arr[ $name ] = $value;

	return $arr;
}
389
  } # anchor_utils
390
 
391
  $anchor_utils = new anchor_utils();
2
  /*
3
  * Anchor Utils
4
  * Author: Denis de Bernardy & Mike Koepke <http://www.semiologic.com>
5
+ * Version: 1.2
6
  */
7
 
8
  if ( @ini_get('pcre.backtrack_limit') <= 750000 )
17
  **/
18
 
19
  class anchor_utils {
20
+ /**
 
21
  * anchor_utils
22
  */
23
+ function anchor_utils() {
24
  add_filter('the_content', array($this, 'filter'), 100);
25
  add_filter('the_excerpt', array($this, 'filter'), 100);
26
  add_filter('widget_text', array($this, 'filter'), 100);
29
  add_action('wp_head', array($this, 'ob_start'), 10000);
30
  } #anchor_utils
31
 
 
32
  /**
33
  * ob_start()
34
  *
48
  }
49
  } # ob_start()
50
 
51
+
52
  /**
53
  * ob_filter()
54
  *
183
  **/
184
 
185
  function parse_anchor($match) {
186
+
187
+ // Fix links whose JavaScript onClick (or similar) handlers are quoted with =" " rather than =' '. These mess up attribute extraction, especially for threaded comments
188
+ $on_patterns = '/(onClick|onMouseOver|onMouseOut|onMouseDown|onMouseUp|onDblClick|onContextMenu|onLoad|onAbort|onError)="(.+?)"/iU';
189
+ $match[1] = preg_replace($on_patterns, "$1='$2'", $match[1]);
190
+
191
  $anchor = array();
192
+ $anchor['attr'] = shortcode_parse_atts($match[1]);
193
 
194
  if ( !is_array($anchor['attr']) || empty($anchor['attr']['href']) # parser error or no link
195
  || trim($anchor['attr']['href']) != esc_url($anchor['attr']['href'], null, 'db') ) # likely a script
306
 
307
  return str_replace(array_keys($unescape), array_values($unescape), $text);
308
  } # unescape()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  } # anchor_utils
310
 
311
  $anchor_utils = new anchor_utils();
readme.txt CHANGED
@@ -25,7 +25,7 @@ Under Settings / External Links, you can configure the plugin to:
25
 
26
  The [Semiologic forum](http://forum.semiologic.com) is the best place to report issues. Please note, however, that while community members and I do our best to answer all queries, we're assisting you on a voluntary basis.
27
 
28
- If you require more dedicated assistance, consider using [Semiologic Pro](http://www.semiologic.com).
29
 
30
 
31
  == Installation ==
@@ -36,13 +36,10 @@ If you require more dedicated assistance, consider using [Semiologic Pro](http:/
36
 
37
  == Change Log ==
38
 
39
- = 5.1 =
40
-
41
- - Take two! Due to issues with breaking Google AdSense code, reverted to the 4.2 parsing code, but added more advanced DOM attribute parsing code to handle various link configurations.
42
-
43
  = 5.0 =
44
 
45
  - Completely replaced the mechanism for parsing links to resolve the various errors that have been occurring with different external services' link attributes
 
46
  - Tested with WP 3.7
47
 
48
  = 4.2 =
25
 
26
  The [Semiologic forum](http://forum.semiologic.com) is the best place to report issues. Please note, however, that while community members and I do our best to answer all queries, we're assisting you on a voluntary basis.
27
 
28
+ If you require more dedicated assistance, consider using [Semiologic Pro](http://www.getsemiologic.com).
29
 
30
 
31
  == Installation ==
36
 
37
  == Change Log ==
38
 
 
 
 
 
39
  = 5.0 =
40
 
41
  - Completely replaced the mechanism for parsing links to resolve the various errors that have been occurring with different external services' link attributes
42
+ - Added new option to apply to text widgets in addition to content and comment links. This will now handle 95% of the external link cases and allow for Apply Globally to be left off.
43
  - Tested with WP 3.7
44
 
45
  = 4.2 =
sem-external-links-admin.php CHANGED
@@ -25,10 +25,10 @@ class external_links_admin {
25
 
26
  check_admin_referer('external_links');
27
 
28
- foreach ( array('global', 'icon', 'target', 'nofollow') as $var )
29
  $$var = isset($_POST[$var]);
30
 
31
- update_option('external_links', compact('global', 'icon', 'target', 'nofollow'));
32
 
33
  echo "<div class=\"updated fade\">\n"
34
  . "<p>"
@@ -78,11 +78,26 @@ class external_links_admin {
78
  . checked($options['global'], true, false)
79
  . ' />'
80
  . '&nbsp;'
81
- . __('Apply these settings to all outbound links, including those in headers, footers and sidebars, rather than only those in posts/pages, comments and text widgets.', 'external-links')
82
  . '</label>'
83
  . '</td>' . "\n"
84
  . '</tr>' . "\n";
85
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  echo '<tr>' . "\n"
87
  . '<th scope="row">'
88
  . __('Add Icons', 'external-links')
@@ -113,7 +128,7 @@ class external_links_admin {
113
  . __('Add a rel=nofollow attribute to outbound links.', 'external-links')
114
  . '</label>'
115
  . '<br />' . "\n"
116
- . __('Note: You can override this behavior by adding a rel="follow" to individual links.', 'external-links')
117
  . '</td>' . "\n"
118
  . '</tr>' . "\n";
119
 
25
 
26
  check_admin_referer('external_links');
27
 
28
+ foreach ( array('global', 'icon', 'target', 'nofollow', 'text_widgets') as $var )
29
  $$var = isset($_POST[$var]);
30
 
31
+ update_option('external_links', compact('global', 'icon', 'target', 'nofollow', 'text_widgets'));
32
 
33
  echo "<div class=\"updated fade\">\n"
34
  . "<p>"
78
  . checked($options['global'], true, false)
79
  . ' />'
80
  . '&nbsp;'
81
+ . __('Apply these settings to all outbound links, including those in headers, sidebars and footers, rather than to those in posts and comments.', 'external-links')
82
  . '</label>'
83
  . '</td>' . "\n"
84
  . '</tr>' . "\n";
85
+
86
+ echo '<tr>' . "\n"
87
+ . '<th scope="row">'
88
+ . __('Apply to Text Widgets', 'external-links')
89
+ . '</th>' . "\n"
90
+ . '<td>'
91
+ . '<label>'
92
+ . '<input type="checkbox" name="text_widgets"'
93
+ . checked($options['text_widgets'], true, false)
94
+ . ' />'
95
+ . '&nbsp;'
96
+ . __('Apply these settings to all text widgets in addition to those in posts and comments.', 'external-links')
97
+ . '</label>'
98
+ . '</td>' . "\n"
99
+ . '</tr>' . "\n";
100
+
101
  echo '<tr>' . "\n"
102
  . '<th scope="row">'
103
  . __('Add Icons', 'external-links')
128
  . __('Add a rel=nofollow attribute to outbound links.', 'external-links')
129
  . '</label>'
130
  . '<br />' . "\n"
131
+ . __('Note: You can override this behavior by adding the attribute rel="follow" to individual links.', 'external-links')
132
  . '</td>' . "\n"
133
  . '</tr>' . "\n";
134
 
sem-external-links.php CHANGED
@@ -3,9 +3,9 @@
3
  Plugin Name: External Links
4
  Plugin URI: http://www.semiologic.com/software/external-links/
5
  Description: Marks outbound links as such, with various effects that are configurable under <a href="options-general.php?page=external-links">Settings / External Links</a>.
6
- Version: 5.1
7
  Author: Denis de Bernardy & Mike Koepke
8
- Author URI: http://www.semiologic.com
9
  Text Domain: external-links
10
  Domain Path: /lang
11
  */
@@ -14,7 +14,7 @@ Domain Path: /lang
14
  Terms of use
15
  ------------
16
 
17
- This software is copyright Denis de Bernardy & Mike Koepke, and is distributed under the terms of the GPL license, v2.
18
 
19
  http://www.opensource.org/licenses/gpl-2.0.php
20
  **/
@@ -31,7 +31,7 @@ load_plugin_textdomain('external-links', false, dirname(plugin_basename(__FILE__
31
 
32
  class external_links {
33
 
34
- protected $opts;
35
 
36
  /**
37
  * constructor()
@@ -39,15 +39,24 @@ class external_links {
39
  public function __construct() {
40
  if ( !is_admin() ) {
41
 
42
- if ( !class_exists('anchor_utils') )
43
- include dirname(__FILE__) . '/anchor-utils/anchor-utils.php';
44
 
45
  $o = external_links::get_options();
46
 
47
  if ( $o['icon'] )
48
  add_action('wp_print_styles', array($this, 'styles'), 5);
49
 
50
- add_filter(($o['global'] ? 'ob_' : '' ) . 'filter_anchor', array($this, 'filter'));
 
 
 
 
 
 
 
 
 
51
 
52
  unset($o);
53
  } else {
@@ -68,45 +77,95 @@ class external_links {
68
 
69
 
70
  /**
71
- * filter()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  *
73
- * @param $anchor
74
- * @return string
75
- */
76
 
77
- function filter($anchor) {
78
- # disable in feeds
79
- if ( is_feed() )
80
- return $anchor;
81
 
82
- # ignore local urls
83
- if ( external_links::is_local_url($anchor['attr']['href']) )
84
- return $anchor;
85
 
86
- # no icons for images
87
- $is_image = (bool) preg_match("/^\s*<\s*img\s.+?>\s*$/is", $anchor['body']);
 
 
 
 
88
 
89
- $o = external_links::get_options();
90
 
91
- if ( !in_array('external', $anchor['attr']['class']) )
92
- $anchor['attr']['class'][] = 'external';
93
 
94
- if ( !$is_image && $o['icon'] && !in_array('external_icon', $anchor['attr']['class'])
95
- && !in_array('no_icon', $anchor['attr']['class'])
96
- && !in_array('noicon', $anchor['attr']['class']) )
97
- $anchor['attr']['class'][] = 'external_icon';
98
 
99
- if ( $o['nofollow'] && !function_exists('strip_nofollow')
100
- && !in_array('nofollow', $anchor['attr']['rel'])
101
- && !in_array('follow', $anchor['attr']['rel']) )
102
- $anchor['attr']['rel'][] = 'nofollow';
103
 
104
- if ( $o['target'] && empty($anchor['attr']['target']) )
105
- $anchor['attr']['target'] = '_blank';
 
 
106
 
107
- return $anchor;
108
  } # filter()
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  /**
111
  * is_local_url()
112
  *
@@ -212,7 +271,7 @@ class external_links {
212
 
213
  $o = get_option('external_links');
214
 
215
- if ( $o === false )
216
  $o = external_links::init_options();
217
 
218
  return $o;
@@ -233,6 +292,7 @@ class external_links {
233
  'icon' => true,
234
  'target' => false,
235
  'nofollow' => true,
 
236
  );
237
 
238
  if ( !$o )
3
  Plugin Name: External Links
4
  Plugin URI: http://www.semiologic.com/software/external-links/
5
  Description: Marks outbound links as such, with various effects that are configurable under <a href="options-general.php?page=external-links">Settings / External Links</a>.
6
+ Version: 5.0
7
  Author: Denis de Bernardy & Mike Koepke
8
+ Author URI: http://www.getsemiologic.com
9
  Text Domain: external-links
10
  Domain Path: /lang
11
  */
14
  Terms of use
15
  ------------
16
 
17
+ This software is copyright Denis de Bernardy & Mike Koepke, and is distributed under the terms of the GPL license, v.2.
18
 
19
  http://www.opensource.org/licenses/gpl-2.0.php
20
  **/
31
 
32
  class external_links {
33
 
34
+ private $opts;
35
 
36
  /**
37
  * constructor()
39
  public function __construct() {
40
  if ( !is_admin() ) {
41
 
42
+ if ( !class_exists('simple_html_dom_node') )
43
+ include dirname(__FILE__) . '/simple_html_dom.php';
44
 
45
  $o = external_links::get_options();
46
 
47
  if ( $o['icon'] )
48
  add_action('wp_print_styles', array($this, 'styles'), 5);
49
 
50
+ if ( $o['global'] )
51
+ add_action('wp_head', array($this, 'ob_start'), 10000);
52
+ else {
53
+ add_filter('the_content', array($this, 'filter'), 100);
54
+ add_filter('the_excerpt', array($this, 'filter'), 100);
55
+ add_filter('comment_text', array($this, 'filter'), 100);
56
+
57
+ if ( $o['text_widgets'] )
58
+ add_filter('widget_text', array($this, 'filter'), 100);
59
+ }
60
 
61
  unset($o);
62
  } else {
77
 
78
 
79
  /**
80
+ * ob_start()
81
+ *
82
+ * @return void
83
+ **/
84
+
85
function ob_start() {
	# queue the matching flush on wp_footer, then open the buffer;
	# the buffered output is passed through $this->filter()
	add_action('wp_footer', array($this, 'ob_flush'), 10000);
	ob_start(array($this, 'filter'));
} # ob_start()
91
+
92
+
93
+ /**
94
+ * ob_flush()
95
  *
96
+ * @return void
97
+ **/
 
98
 
99
static function ob_flush() {
	# flush and close the innermost output buffer
	ob_end_flush();
} # ob_flush()
 
103
 
104
+ /**
105
+ * filter()
106
+ *
107
+ * @param string $text
108
+ * @return string $text
109
+ **/
110
 
111
function filter($text) {
	# nothing to parse, or a feed (where no link is ever modified):
	# hand the text back untouched instead of re-serializing it
	if ( !is_string($text) || '' === trim($text) || is_feed() )
		return $text;

	$this->opts = external_links::get_options();

	$html = new simple_html_dom();

	# 2nd argument: lowercase (parser default). 3rd argument: $stripRN, turned
	# off so the parser does not crush \r\n in the content being filtered
	# (line breaks matter inside <pre>, <textarea> and inline scripts).
	$html->load( $text, true, false );

	foreach ( $html->find( 'a, img' ) as $link )
		$this->apply_attributes( $link );

	$text = $html->save();

	# release the parser's node tree once the markup has been saved
	$html->clear();
	unset($html);

	return $text;
} # filter()
128
 
129
+ /**
130
+ * apply_attributes()
131
+ *
132
+ * @param simple_html_dom_node $anchor
133
+ * @return null
134
+ **/
135
+
136
function apply_attributes( $anchor ) {
	# disable in feeds
	if ( is_feed() )
		return;

	$is_link = ( $anchor->tag == 'a' );

	# named anchors (no href) and images without a src point nowhere
	$url = $is_link ? $anchor->href : $anchor->src;
	if ( !is_string($url) || '' === trim($url) )
		return;

	# ignore local urls
	if ( $this->is_local_url( $url ) )
		return;

	# compare whole class names, so that e.g. "external_icon" alone
	# does not pass for "external"
	$old_class = isset($anchor->class) ? trim($anchor->class) : '';
	$tokens = preg_split('/\s+/', strtolower($old_class), -1, PREG_SPLIT_NO_EMPTY);
	$class = $old_class;

	if ( !in_array('external', $tokens, true) )
		$class = ltrim($class . ' external');

	# icons are for links only, and only when not already set or opted out
	if ( $is_link && $this->opts['icon']
		&& !array_intersect(array('external_icon', 'no_icon', 'noicon'), $tokens) )
		$class .= ' external_icon';

	if ( $class !== $old_class || !isset($anchor->class) )
		$anchor->class = $class;

	# rel and target only make sense on links, not on images
	if ( !$is_link )
		return;

	if ( $this->opts['nofollow'] && !function_exists('strip_nofollow') ) {
		$rel = isset($anchor->rel) ? trim($anchor->rel) : '';

		# "follow" also matches "nofollow": either one means the choice was
		# already made on the link itself. Otherwise append, keeping any
		# other rel values that were there.
		if ( stripos($rel, 'follow') === false )
			$anchor->rel = ltrim($rel . ' nofollow');
	}

	if ( $this->opts['target'] && !isset($anchor->target) )
		$anchor->target = '_blank';
} # apply_attributes()
167
+
168
+
169
  /**
170
  * is_local_url()
171
  *
271
 
272
  $o = get_option('external_links');
273
 
274
+ if ( $o === false || !isset($o['text_widgets']) )
275
  $o = external_links::init_options();
276
 
277
  return $o;
292
  'icon' => true,
293
  'target' => false,
294
  'nofollow' => true,
295
+ 'text_widgets' => true,
296
  );
297
 
298
  if ( !$o )
simple_html_dom.php ADDED
@@ -0,0 +1,1742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?php
2
+ /**
3
+ * Website: http://sourceforge.net/projects/simplehtmldom/
4
+ * Additional projects that may be used: http://sourceforge.net/projects/debugobject/
5
+ * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
6
+ * Contributions by:
7
+ * Yousuke Kumakura (Attribute filters)
8
+ * Vadim Voituk (Negative indexes supports of "find" method)
9
+ * Antcs (Constructor with automatically load contents either text or file/url)
10
+ *
11
+ * all affected sections have comments starting with "PaperG"
12
+ *
13
+ * Paperg - Added case insensitive testing of the value of the selector.
14
+ * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
15
+ * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noise calls so it will not reflect the REAL position of the tag in the source,
16
+ * it will almost always be smaller by some amount.
17
+ * We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate as the $dom->size is the "real" number of bytes the dom was created from.
18
+ * but for most purposes, it's a really good estimation.
19
+ * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
20
+ * Allow the user to tell us how much they trust the html.
21
+ * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
22
+ * This allows for us to find tags based on the text they contain.
23
+ * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
24
+ * Paperg: added parse_charset so that we know about the character set of the source document.
25
+ * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
26
+ * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
27
+ *
28
+ * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
29
+ * PaperG (John Schlick) Added get_display_size for "IMG" tags.
30
+ *
31
+ * Licensed under The MIT License
32
+ * Redistributions of files must retain the above copyright notice.
33
+ *
34
+ * @author S.C. Chen <me578022@gmail.com>
35
+ * @author John Schlick
36
+ * @author Rus Carroll
37
+ * @version 1.5 ($Rev: 208 $)
38
+ * @package PlaceLocalInclude
39
+ * @subpackage simple_html_dom
40
+ */
41
+
42
+ /**
43
+ * All of the Defines for the classes below.
44
+ * @author S.C. Chen <me578022@gmail.com>
45
+ */
46
+ define('HDOM_TYPE_ELEMENT', 1);
47
+ define('HDOM_TYPE_COMMENT', 2);
48
+ define('HDOM_TYPE_TEXT', 3);
49
+ define('HDOM_TYPE_ENDTAG', 4);
50
+ define('HDOM_TYPE_ROOT', 5);
51
+ define('HDOM_TYPE_UNKNOWN', 6);
52
+ define('HDOM_QUOTE_DOUBLE', 0);
53
+ define('HDOM_QUOTE_SINGLE', 1);
54
+ define('HDOM_QUOTE_NO', 3);
55
+ define('HDOM_INFO_BEGIN', 0);
56
+ define('HDOM_INFO_END', 1);
57
+ define('HDOM_INFO_QUOTE', 2);
58
+ define('HDOM_INFO_SPACE', 3);
59
+ define('HDOM_INFO_TEXT', 4);
60
+ define('HDOM_INFO_INNER', 5);
61
+ define('HDOM_INFO_OUTER', 6);
62
+ define('HDOM_INFO_ENDSPACE',7);
63
+ define('DEFAULT_TARGET_CHARSET', 'UTF-8');
64
+ define('DEFAULT_BR_TEXT', "\r\n");
65
+ define('DEFAULT_SPAN_TEXT', " ");
66
+ define('MAX_FILE_SIZE', 600000);
67
+ // helper functions
68
+ // -----------------------------------------------------------------------------
69
+ // get html dom from file
70
+ // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
71
/**
 * Read a file or URL and parse it into a simple_html_dom object.
 *
 * $url, $use_include_path, $context and $offset are handed to
 * file_get_contents(); the remaining arguments are handed to the
 * simple_html_dom constructor and to load().
 *
 * NOTE: $maxLen is accepted for signature compatibility but is not passed on
 * to file_get_contents().
 *
 * @return simple_html_dom|false false when nothing was read or the content
 *                               is larger than MAX_FILE_SIZE
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // The -1 default dates from when a negative offset meant "do not seek".
    // Since PHP 7.1 file_get_contents() counts negative offsets from the end
    // of the stream, so -1 would return only the last byte. Read from the start.
    if ($offset < 0)
    {
        $offset = 0;
    }

    // For sourceforge users: uncomment the next line and comment the retrieve_url_contents line 2 lines down if it is not already done.
    $contents = file_get_contents($url, $use_include_path, $context, $offset);
    // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
    //$contents = retrieve_url_contents($url);
    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        // same cleanup as str_get_html() on its failure path
        $dom->clear();
        return false;
    }
    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
87
+
88
+ // get html dom from string
89
/**
 * Parse an HTML string into a simple_html_dom object.
 *
 * @return simple_html_dom|false false when $str is empty (as per empty())
 *                               or longer than MAX_FILE_SIZE bytes
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $parseable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if ($parseable) {
        $dom->load($str, $lowercase, $stripRN);
        return $dom;
    }

    // rejected input: drop the unused parser before reporting failure
    $dom->clear();
    return false;
}
100
+
101
+ // dump html dom tree
102
/**
 * Print the tree below $node by delegating to $node->dump().
 *
 * @param object $node      node to dump; must provide dump($show_attr, $deep)
 * @param bool   $show_attr forwarded to dump()
 * @param int    $deep      forwarded to dump()
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // forward the caller's options; dump() takes ($show_attr, $deep), not the node
    $node->dump($show_attr, $deep);
}
106
+
107
+
108
+ /**
109
+ * simple html dom node
110
+ * PaperG - added ability for "find" routine to lowercase the value of the selector.
111
+ * PaperG - added $tag_start to track the start position of the tag in the total byte index
112
+ *
113
+ * @package PlaceLocalInclude
114
+ */
115
+ class simple_html_dom_node
116
+ {
117
+ public $nodetype = HDOM_TYPE_TEXT;
118
+ public $tag = 'text';
119
+ public $attr = array();
120
+ public $children = array();
121
+ public $nodes = array();
122
+ public $parent = null;
123
+ // The "info" array - see HDOM_INFO_... for what each element contains.
124
+ public $_ = array();
125
+ public $tag_start = 0;
126
+ private $dom = null;
127
+
128
// Registers the new node in $dom->nodes and keeps a reference to $dom.
function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
133
+
134
// Drops the node's references (see clear()) when the object is destroyed.
function __destruct() {
    $this->clear();
}
138
+
139
// String conversion yields the node's outer text.
function __toString() {
    return $this->outertext();
}
143
+
144
+ // clean up memory due to php5 circular references memory leak...
145
// Nulls every reference this node holds to other nodes and to its dom.
function clear()
{
    $this->children = null;
    $this->nodes = null;
    $this->parent = null;
    $this->dom = null;
}
152
+
153
+ // dump node's tree
154
// Prints this node's tag (optionally with its attributes) on one line,
// then does the same for each entry of $this->nodes one level deeper.
function dump($show_attr=true, $deep=0)
{
    $line = str_repeat(' ', $deep) . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        $line .= '(';
        foreach ($this->attr as $attrName => $ignored) {
            $line .= "[$attrName]=>\"" . $this->$attrName . '", ';
        }
        $line .= ')';
    }

    echo $line . "\n";

    if (!$this->nodes) {
        return;
    }
    foreach ($this->nodes as $sub) {
        $sub->dump($show_attr, $deep + 1);
    }
}
176
+
177
+
178
+ // Debugging function to dump a single dom node with a bunch of information about it.
179
/**
 * Build a one-line debug description of this node: tag, attributes, the
 * "$_" info array, text (when set), inner info, child/node counts and
 * tag_start.
 *
 * @param bool $echo true: echo the line and return nothing;
 *                   false: return the line
 * @return string|void
 */
function dump_node($echo=true)
{
    $string = $this->tag;

    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }

    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                // info entries may themselves be arrays; print one level deep
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }

    // Read the inner info from this node. The previous code tested an
    // undefined $node variable, so this part always reported NULL.
    $string .= " HDOM_INNER_INFO: '";
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }

    $string .= " children: " . count($this->children);
    $string .= " nodes: " . count($this->nodes);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo)
    {
        echo $string;
        return;
    }
    else
    {
        return $string;
    }
}
242
+
243
+ // returns the parent of node
244
+ // If a node is passed in, it will reset the parent of the current node to that one.
245
// Getter/setter for the parent node.
// Without an argument: returns the current parent.
// With a node: makes it the parent, appends this node to that node's
// nodes and children lists, and returns it.
function parent($parent=null)
{
    if ($parent === null) {
        return $this->parent;
    }

    // Known limitation (noted by the original author): this node is not
    // removed from its previous parent's nodes/children lists first.
    $this->parent = $parent;
    $this->parent->nodes[] = $this;
    $this->parent->children[] = $this;

    return $this->parent;
}
258
+
259
+ // verify that node has children
260
// True when the children list is non-empty.
function has_child() {
    return empty($this->children) ? false : true;
}
264
+
265
+ // returns children of node
266
// With the default -1: returns the whole children list.
// With an index: returns that child, or null when there is none.
function children($idx=-1)
{
    if ($idx !== -1) {
        return isset($this->children[$idx]) ? $this->children[$idx] : null;
    }
    return $this->children;
}
278
+
279
+ // returns the first child of node
280
// Returns the child at index 0, or null when there are no children.
function first_child()
{
    return (count($this->children) > 0) ? $this->children[0] : null;
}
288
+
289
+ // returns the last child of node
290
// Returns the child at the highest index, or null when there are no children.
function last_child()
{
    $total = count($this->children);
    if ($total <= 0) {
        return null;
    }
    return $this->children[$total - 1];
}
298
+
299
+ // returns the next sibling of node
300
// Returns the entry that follows this node in its parent's children list,
// or null when there is no parent or nothing follows.
function next_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $total = count($this->parent->children);

    // locate this node; $pos ends up equal to $total when it is not listed
    for ($pos = 0; $pos < $total; ++$pos) {
        if ($this === $this->parent->children[$pos]) {
            break;
        }
    }

    $after = $pos + 1;
    return ($after >= $total) ? null : $this->parent->children[$after];
}
319
+
320
// Returns the entry that precedes this node in its parent's children list.
// Returns null when there is no parent, when this node is the first child, or
// when this node is not found in the parent's children list.
//
// The previous implementation decremented the "not found" index (== count) and
// therefore returned the parent's LAST child for a node that is not in the
// children list; it now returns null, matching next_sibling().
function prev_sibling()
{
    if ($this->parent === null)
    {
        return null;
    }

    $count = count($this->parent->children);
    for ($pos = 0; $pos < $count; ++$pos)
    {
        if ($this === $this->parent->children[$pos])
        {
            // Found: the first child has no previous sibling.
            return ($pos > 0) ? $this->parent->children[$pos - 1] : null;
        }
    }

    // This node is not one of the parent's children.
    return null;
}
331
+
332
// Walks from this node up through the parent chain (this node included) and
// returns the first node whose tag loosely equals $tag, or null if none does.
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent)
    {
        if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $candidate->tag); }

        if ($candidate->tag == $tag)
        {
            return $candidate;
        }
    }

    // Ran off the top of the tree without a match.
    return null;
}
353
+
354
// Returns this node's inner HTML.
// A value stored under HDOM_INFO_INNER wins; next a stored HDOM_INFO_TEXT value
// (passed through the DOM's restore_noise()); otherwise the outertext() of every
// entry in $this->nodes is concatenated.
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '';
    foreach ($this->nodes as $child)
    {
        $html .= $child->outertext();
    }
    return $html;
}
365
+
366
// Returns this node's outer HTML (the node including its own tag).
// The root node simply returns innertext(). For any other node the DOM callback
// (when set) is invoked first, then stored HDOM_INFO_OUTER / HDOM_INFO_TEXT values
// are returned if present; otherwise begin tag, inner content and end tag are built.
function outertext()
{
    global $debug_object;
    if (is_object($debug_object))
    {
        $suffix = '';
        if ($this->tag == 'text' && !empty($this->text))
        {
            $suffix = " with text: " . $this->text;
        }
        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
    }

    if ($this->tag === 'root')
    {
        return $this->innertext();
    }

    // trigger callback - it runs before the stored values are consulted
    if ($this->dom && $this->dom->callback !== null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER]))
    {
        return $this->_[HDOM_INFO_OUTER];
    }
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // render begin tag
    $html = "";
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
    {
        $html = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // A br tag never emits the stored inner value.
        if ($this->tag != "br")
        {
            $html .= $this->_[HDOM_INFO_INNER];
        }
    }
    else if ($this->nodes)
    {
        foreach ($this->nodes as $child)
        {
            $html .= $this->convert_text($child->outertext());
        }
    }

    // render end tag
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0)
    {
        $html .= '</'.$this->tag.'>';
    }
    return $html;
}
425
+
426
// Returns this node's plain text.
// A stored HDOM_INFO_INNER value wins. Text nodes return their stored text (via
// restore_noise()); comment/unknown nodes and script/style tags return ''. For
// everything else the text() of each entry in $this->nodes is concatenated, and a
// span tag gets the DOM's default_span_text appended.
function text()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }

    switch ($this->nodetype)
    {
        case HDOM_TYPE_TEXT:
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        case HDOM_TYPE_COMMENT:
        case HDOM_TYPE_UNKNOWN:
            return '';
    }

    if (strcasecmp($this->tag, 'script') === 0 || strcasecmp($this->tag, 'style') === 0)
    {
        return '';
    }

    // $this->nodes has been observed to be NULL in rare cases (cause unknown);
    // such a node contributes no text.
    if (is_null($this->nodes))
    {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $child)
    {
        $plain .= $this->convert_text($child->text());
    }

    // Append the configured span text so adjacent spans don't run into each other.
    if ($this->tag == "span")
    {
        $plain .= $this->dom->default_span_text;
    }

    return $plain;
}
460
+
461
// Returns innertext() with CDATA wrappers removed: every '<![CDATA[' (matched
// case-insensitively) and every ']]>' is deleted.
function xmltext()
{
    $inner = $this->innertext();
    $withoutOpeners = str_ireplace('<![CDATA[', '', $inner);
    return str_replace(']]>', '', $withoutOpeners);
}
468
+
469
// Builds the text of this node's opening tag from $this->tag and $this->attr.
// Nodes that carry HDOM_INFO_TEXT (text, comment, unknown) return that text instead.
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $tagText = '<'.$this->tag;

    // $slot is the attribute's position; it indexes the parallel
    // HDOM_INFO_SPACE / HDOM_INFO_QUOTE arrays and must advance for skipped
    // attributes too.
    $slot = -1;
    foreach ($this->attr as $attrName => $attrValue)
    {
        ++$slot;

        // skip removed attribute
        if ($attrValue === null || $attrValue === false)
        {
            continue;
        }

        $tagText .= $this->_[HDOM_INFO_SPACE][$slot][0];

        // no value attr: nowrap, checked selected...
        if ($attrValue === true)
        {
            $tagText .= $attrName;
            continue;
        }

        switch ($this->_[HDOM_INFO_QUOTE][$slot])
        {
            case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
            case HDOM_QUOTE_SINGLE: $quote = '\''; break;
            default: $quote = '';
        }
        $tagText .= $attrName.$this->_[HDOM_INFO_SPACE][$slot][1].'='.$this->_[HDOM_INFO_SPACE][$slot][2].$quote.$attrValue.$quote;
    }

    $tagText = $this->dom->restore_noise($tagText);
    return $tagText . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
503
+
504
// Finds elements below this node by CSS selector.
// $selector  - selector string; it is split by parse_selector() into one list of
//              levels per comma-separated selector.
// $idx       - null returns the array of all matches; an integer returns that
//              single match (negative counts from the end) or null.
// $lowercase - passed through to seek() for case-insensitive value tests (PaperG).
function find($selector, $idx=null, $lowercase=false)
{
    $selectors = $this->parse_selector($selector);
    $selectorCount = count($selectors);
    if ($selectorCount === 0)
    {
        return array();
    }

    $found_keys = array();

    // find each selector
    for ($s = 0; $s < $selectorCount; ++$s)
    {
        // Uses $selectors[$s], not $selectors[0] (sourceforge tracker id 2788009).
        $depth = count($selectors[$s]);
        if ($depth === 0)
        {
            return array();
        }
        if (!isset($this->_[HDOM_INFO_BEGIN]))
        {
            return array();
        }

        // Keys are node indexes; start from this node.
        $frontier = array($this->_[HDOM_INFO_BEGIN] => 1);

        // handle descendant selectors level by level, without recursion
        for ($level = 0; $level < $depth; ++$level)
        {
            $next = array();
            foreach ($frontier as $nodeKey => $unused)
            {
                $start = ($nodeKey === -1) ? $this->dom->root : $this->dom->nodes[$nodeKey];
                $start->seek($selectors[$s][$level], $next, $lowercase);
            }
            $frontier = $next;
        }

        foreach ($frontier as $nodeKey => $unused)
        {
            $found_keys[$nodeKey] = 1;
        }
    }

    // sort keys
    ksort($found_keys);

    $found = array();
    foreach ($found_keys as $nodeKey => $unused)
    {
        $found[] = $this->dom->nodes[$nodeKey];
    }

    // return nth-element or array
    if (is_null($idx))
    {
        return $found;
    }
    if ($idx < 0)
    {
        $idx = count($found) + $idx;
    }
    return isset($found[$idx]) ? $found[$idx] : null;
}
556
+
557
// Seeks nodes below this node that satisfy one parsed selector level.
// $selector  - array($tag, $key, $val, $exp, $no_key) as produced by parse_selector().
// $ret       - passed by reference; matching node indexes are added as keys
//              (value 1). This is effectively the function's return value.
// $lowercase - PaperG: when true, selector values are compared case-insensitively.
//
// Fix: while looking for an ancestor that has an end index, the null check on
// $parent now comes first, and the ancestor's end index is only read when such an
// ancestor exists. Previously a null $parent was dereferenced.
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index: a numeric key selects the key-th direct child with that tag
    if ($tag && $key && is_numeric($key))
    {
        $count = 0;
        foreach ($this->children as $c)
        {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    // Work out the exclusive upper bound of node indexes to scan. When this node
    // has no end index of its own, borrow the nearest ancestor's, reduced by one
    // for each ancestor skipped on the way.
    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        $parent = $this->parent;
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        if ($parent!==null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // bare '*' only matches direct children of this node
        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
            }
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            // A "plaintext" key is tested against the node's text(), any other
            // key against the value of that attribute.
            if ($key == "plaintext") {
                $nodeKeyValue = $node->text();
            } else {
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class: retry against each space-separated class name
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ',$node->attr[$key]) as $k) {
                    // Skip blanks produced by leading, trailing, or double spaces.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) break;
                    }
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
}
655
+
656
// Tests $value against $pattern using selector operator $exp.
//   '='  strict equality            '!=' strict inequality
//   '^=' value starts with pattern  '$=' value ends with pattern
//   '*=' pattern used as a regex: as-is when it starts with '/', otherwise
//        wrapped in /.../i without quoting
// The regex operators return preg_match()'s result; unknown operators return false.
protected function match($exp, $pattern, $value) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($exp == '=') {
        return ($value===$pattern);
    }
    if ($exp == '!=') {
        return ($value!==$pattern);
    }
    if ($exp == '^=') {
        return preg_match("/^".preg_quote($pattern,'/')."/", $value);
    }
    if ($exp == '$=') {
        return preg_match("/".preg_quote($pattern,'/')."$/", $value);
    }
    if ($exp == '*=') {
        $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
        return preg_match($regex, $value);
    }
    return false;
}
677
+
678
// Splits a selector string into a list of selectors (one per comma-separated
// part), each a list of levels of the form array($tag, $key, $val, $exp, $no_key).
protected function parse_selector($selector_string) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    // pattern of CSS selectors, modified from mootools
    // Paperg: the attribute name class includes the colon so that
    // <tag attr:ibute="something"> is found. Such an attribute must be read with
    // getAttribute since $dom->x:y is not valid PHP.
    // The \[@? part lets an attribute begin with an @ sign that is not captured;
    // whether that should be documented or removed needs further study.
    $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}

    $selectors = array();
    $levels = array();

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') {
            continue;
        }
        // for browser generated xpath
        if ($m[1]==='tbody') {
            continue;
        }

        $tag = $m[1];
        $key = null;
        $val = null;
        $exp = '=';
        $no_key = false;

        // Later groups override earlier ones: #id, .class, [key], operator, value.
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase
        if ($this->dom->lowercase) {
            $tag = strtolower($tag);
            $key = strtolower($key);
        }
        // a leading '!' asks for elements that do NOT have the attribute
        if (isset($key[0]) && $key[0]==='!') {
            $key = substr($key, 1);
            $no_key = true;
        }

        $levels[] = array($tag, $key, $val, $exp, $no_key);

        // a comma separator closes the current selector
        if (trim($m[7])===',') {
            $selectors[] = $levels;
            $levels = array();
        }
    }
    if (count($levels)>0) {
        $selectors[] = $levels;
    }
    return $selectors;
}
725
+
726
// Magic getter. A set attribute is returned through convert_text(). Otherwise the
// names outertext / innertext / plaintext / xmltext map to the matching method,
// and any other name returns whether that key exists in the attribute array.
function __get($name)
{
    if (isset($this->attr[$name]))
    {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext')
    {
        return $this->outertext();
    }
    if ($name == 'innertext')
    {
        return $this->innertext();
    }
    if ($name == 'plaintext')
    {
        return $this->text();
    }
    if ($name == 'xmltext')
    {
        return $this->xmltext();
    }
    return array_key_exists($name, $this->attr);
}
741
+
742
// Magic setter. 'outertext' and 'innertext' store the value in the node's info
// array; any other name is stored as an attribute. A new attribute also gets
// default spacing and double-quote entries appended for it.
function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($name == 'outertext')
    {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }
    if ($name == 'innertext')
    {
        // A node that already carries HDOM_INFO_TEXT has that text replaced instead.
        if (isset($this->_[HDOM_INFO_TEXT]))
        {
            return $this->_[HDOM_INFO_TEXT] = $value;
        }
        return $this->_[HDOM_INFO_INNER] = $value;
    }

    $isNewAttribute = !isset($this->attr[$name]);
    if ($isNewAttribute)
    {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
761
+
762
// Magic isset. outertext, innertext and plaintext always report true; any other
// name reports whether it is a key of the attribute array (this includes
// no-value attributes such as nowrap, checked, selected).
function __isset($name)
{
    if (in_array($name, array('outertext', 'innertext', 'plaintext')))
    {
        return true;
    }

    //no value attr: nowrap, checked selected...
    return array_key_exists($name, $this->attr) || isset($this->attr[$name]);
}
773
+
774
// Magic unset: removes attribute $name when it is currently set.
function __unset($name) {
    if (!isset($this->attr[$name]))
    {
        return;
    }
    unset($this->attr[$name]);
}
778
+
779
// PaperG - Converts $text from the DOM's _charset to its _target_charset when
// both are known and differ. Text that already validates as UTF-8 is left alone
// when the target is UTF-8. For a UTF-8 target a byte-order mark at the start or
// end of the result is stripped.
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    $sourceCharset = "";
    $targetCharset = "";
    if ($this->dom)
    {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

    $result = $text;

    $charsetsDiffer = !empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0);
    if ($charsetsDiffer)
    {
        // The reported encoding could have been wrong and the text already UTF-8.
        $alreadyUtf8 = (strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text));
        if (!$alreadyUtf8)
        {
            $result = iconv($sourceCharset, $targetCharset, $text);
        }
    }

    // Make sure no BOM is left on any UTF-8 text we output.
    if ($targetCharset == 'UTF-8')
    {
        $bom = "\xef\xbb\xbf";
        if (substr($result, 0, 3) == $bom)
        {
            $result = substr($result, 3);
        }
        if (substr($result, -3) == $bom)
        {
            $result = substr($result, 0, -3);
        }
    }

    return $result;
}
825
+
826
/**
 * Returns true if $str is structurally valid UTF-8 and false otherwise.
 *
 * Every byte >= 0x80 must be a lead byte followed by the right number of
 * continuation bytes (0x80-0xBF). The legacy 5- and 6-byte lead bytes are still
 * accepted, as before; overlong forms are not detected.
 *
 * Fix: the ASCII test used "> 128", so a stray 0x80 byte was treated as ASCII
 * and the string was reported as valid.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++)
    {
        $c = ord($str[$i]);
        if ($c < 128)
        {
            // 7-bit ASCII
            continue;
        }

        // Length of the sequence announced by the lead byte.
        if ($c >= 254) return false;
        elseif ($c >= 252) $bits = 6;
        elseif ($c >= 248) $bits = 5;
        elseif ($c >= 240) $bits = 4;
        elseif ($c >= 224) $bits = 3;
        elseif ($c >= 192) $bits = 2;
        else return false; // 0x80-0xBF: continuation byte without a lead byte

        // The sequence must fit inside the string.
        if (($i + $bits) > $len) return false;

        while ($bits > 1)
        {
            $i++;
            $b = ord($str[$i]);
            if ($b < 128 || $b > 191) return false;
            $bits--;
        }
    }
    return true;
}
861
+ /*
862
+ function is_utf8($string)
863
+ {
864
+ //this is buggy
865
+ return (utf8_encode(utf8_decode($string)) == $string);
866
+ }
867
+ */
868
+
869
/**
 * Function to try a few tricks to determine the displayed size of an img on the page.
 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 *
 * The width/height attributes of the tag are used first. A dimension still
 * unknown is then taken from the inline style when it is an integer pixel value.
 *
 * Changes: a pixel value of 0 is now accepted (filter_var() returns int 0 for
 * it, which the old truthiness test rejected), and whitespace around a style
 * value no longer defeats the "px" suffix check.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
 */
function get_display_size()
{
    if ($this->tag !== 'img')
    {
        return false;
    }

    // -1 means "unknown"; key order matches the historical return value.
    $size = array('height' => -1, 'width' => -1);
    $dimensions = array('width', 'height');

    // See if there is a height or width attribute in the tag itself.
    foreach ($dimensions as $dimension)
    {
        if (isset($this->attr[$dimension]))
        {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style']))
    {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array();
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $declarations[$match[1]] = trim($match[2]);
        }

        foreach ($dimensions as $dimension)
        {
            // Only fill in a dimension the tag attributes did not provide.
            if (!isset($declarations[$dimension]) || $size[$dimension] != -1)
            {
                continue;
            }
            // check that the last two characters are px (pixels)
            if (strtolower(substr($declarations[$dimension], -2)) != 'px')
            {
                continue;
            }
            $proposed = substr($declarations[$dimension], 0, -2);
            // filter_var() returns false on failure, so compare strictly;
            // a valid "0" must not be mistaken for a failure.
            if (filter_var($proposed, FILTER_VALIDATE_INT) !== false)
            {
                $size[$dimension] = $proposed;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

    // Far future enhancement
    // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
    // Note that in this case, the class or id will have the img subselector for it to apply to the image.

    // ridiculously far future development
    // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

    return $size;
}
956
+
957
// camel naming conventions: thin aliases for the methods above

// Returns the raw attribute array.
function getAllAttributes()
{
    return $this->attr;
}

// Alias of __get($name).
function getAttribute($name)
{
    return $this->__get($name);
}

// Alias of __set($name, $value); returns nothing.
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

// Alias of __isset($name).
function hasAttribute($name)
{
    return $this->__isset($name);
}

// Sets the attribute to null via __set(); returns nothing.
function removeAttribute($name)
{
    $this->__set($name, null);
}

// First match of the selector "#$id".
function getElementById($id)
{
    return $this->find("#$id", 0);
}

// Matches of the selector "#$id"; $idx as in find().
function getElementsById($id, $idx=null)
{
    return $this->find("#$id", $idx);
}

// First match of the selector $name.
function getElementByTagName($name)
{
    return $this->find($name, 0);
}

// Matches of the selector $name; $idx as in find().
function getElementsByTagName($name, $idx=null)
{
    return $this->find($name, $idx);
}

// Alias of parent() called without an argument.
function parentNode()
{
    return $this->parent();
}

// Alias of children($idx).
function childNodes($idx=-1)
{
    return $this->children($idx);
}

// Alias of first_child().
function firstChild()
{
    return $this->first_child();
}

// Alias of last_child().
function lastChild()
{
    return $this->last_child();
}

// Alias of next_sibling().
function nextSibling()
{
    return $this->next_sibling();
}

// Alias of prev_sibling().
function previousSibling()
{
    return $this->prev_sibling();
}

// Alias of has_child().
function hasChildNodes()
{
    return $this->has_child();
}

// Returns this node's tag.
function nodeName()
{
    return $this->tag;
}

// Makes this node the parent of $node (via $node->parent($this)) and returns $node.
function appendChild($node)
{
    $node->parent($this);
    return $node;
}
976
+
977
+ }
978
+
979
+ /**
980
+ * simple html dom parser
981
+ * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
982
+ * Paperg - change $size from protected to public so we can easily access it
983
+ * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
984
+ *
985
+ * @package PlaceLocalInclude
986
+ */
987
+ class simple_html_dom
988
+ {
989
+ public $root = null;
990
+ public $nodes = array();
991
+ public $callback = null;
992
+ public $lowercase = false;
993
+ // Used to keep track of how large the text was when we started.
994
+ public $original_size;
995
+ public $size;
996
+ protected $pos;
997
+ protected $doc;
998
+ protected $char;
999
+ protected $cursor;
1000
+ protected $parent;
1001
+ protected $noise = array();
1002
+ protected $token_blank = " \t\r\n";
1003
+ protected $token_equal = ' =/>';
1004
+ protected $token_slash = " />\r\n\t";
1005
+ protected $token_attr = ' >';
1006
+ // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
1007
+ public $_charset = '';
1008
+ public $_target_charset = '';
1009
+ protected $default_br_text = "";
1010
+ public $default_span_text = "";
1011
+
1012
+ // use isset instead of in_array, performance boost about 30%...
1013
+ protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
1014
+ protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
1015
+ // Known sourceforge issue #2977341
1016
+ // B tags that are not closed cause us to return everything to the end of the document.
1017
+ protected $optional_closing_tags = array(
1018
+ 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
1019
+ 'th'=>array('th'=>1),
1020
+ 'td'=>array('td'=>1),
1021
+ 'li'=>array('li'=>1),
1022
+ 'dt'=>array('dt'=>1, 'dd'=>1),
1023
+ 'dd'=>array('dd'=>1, 'dt'=>1),
1024
+ 'dl'=>array('dd'=>1, 'dt'=>1),
1025
+ 'p'=>array('p'=>1),
1026
+ 'nobr'=>array('nobr'=>1),
1027
+ 'b'=>array('b'=>1),
1028
+ 'option'=>array('option'=>1),
1029
+ );
1030
+
1031
// Creates the DOM and, when $str is given, loads it.
// $str             - HTML text, or an http:// URL / existing file path to load from.
// $lowercase, $stripRN, $defaultBRText, $defaultSpanText - forwarded to load()
//                    when $str is HTML text.
// $forceTagsClosed - false empties the optional-closing-tags table, i.e. the HTML
//                    is trusted. Forcing tags closed implies we don't trust the
//                    html, but it can lead to parsing errors if we SHOULD trust it.
// $target_charset  - stored in $this->_target_charset.
//
// Fix: the flag used to assign a misspelled, undeclared property
// ($optional_closing_array) and did so after parsing, so it never had any
// effect. It now clears $optional_closing_tags before the document is loaded.
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str)
    {
        if (preg_match("/^http:\/\//i",$str) || is_file($str))
        {
            $this->load_file($str);
        }
        else
        {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }

    $this->_target_charset = $target_charset;
}
1050
+
1051
// Releases the node references via clear() when the DOM object is destroyed.
function __destruct()
{
    $this->clear();
}
1055
+
1056
// Loads HTML from a string: resets the DOM via prepare(), hands a series of
// patterns to remove_noise(), parses the rest, then determines the charset.
// Returns $this so calls can be chained.
function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    global $debug_object;

    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

    // The order of the remove_noise() calls below is deliberate; keep it.

    // cdata
    $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
    // comments
    $this->remove_noise("'<!--(.*?)-->'is");
    // <script> tags. Script removal precedes style removal, per
    // http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
    $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
    // <style> tags
    $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
    $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
    // preformatted (<code>) tags
    $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
    // server side scripts
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
    // smarty scripts
    $this->remove_noise("'(\{\w)(.*?)(\})'s", true);

    // parsing: parse() is called until it returns a false value
    while ($this->parse())
    {
        // nothing to do here; parse() does the work
    }

    // end
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    // make load function chainable
    return $this;
}
1092
+
1093
// Loads HTML from a file or URL. All arguments are passed straight to
// file_get_contents().
// Returns false (after clearing the DOM) when the contents could not be read;
// returns nothing on success, as before.
//
// Fix: failure used to be detected with error_get_last(), which also reports
// errors raised earlier and unrelated to this call, so a perfectly good document
// could be thrown away. The result of file_get_contents() is now checked
// directly, and nothing is parsed when the read fails.
function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
1104
+
1105
// Registers the callback stored in $this->callback.
// $function_name - the callable to store.
function set_callback($function_name)
{
    $this->callback = $function_name;
}
1110
+
1111
// Unregisters the callback by resetting $this->callback to null.
function remove_callback()
{
    $this->callback = null;
}
1116
+
1117
// Returns the DOM as a string (the root's innertext()).
// When $filepath is not the empty string the text is also written to that path
// under an exclusive lock.
function save($filepath='')
{
    $html = $this->root->innertext();
    if ($filepath !== '')
    {
        file_put_contents($filepath, $html, LOCK_EX);
    }
    return $html;
}
1124
+
1125
// Finds DOM nodes by CSS selector; delegates to the root node's find().
// Paperg - $lowercase requests case insensitive testing of the selector value.
function find($selector, $idx=null, $lowercase=false)
{
    $root = $this->root;
    return $root->find($selector, $idx, $lowercase);
}
1131
+
1132
// Cleans up memory (php5 circular references memory leak): calls clear() on
// every node, on the children if that property is set, and on the parent and
// root, then unsets the parent, root, doc and noise properties.
function clear()
{
    foreach ($this->nodes as $node)
    {
        $node->clear();
        $node = null;
    }

    // Added per sourceforge repository item 2977248 as a fix for memory leaks
    // that occur even with the use of clear.
    if (isset($this->children))
    {
        foreach ($this->children as $child)
        {
            $child->clear();
            $child = null;
        }
    }

    if (isset($this->parent))
    {
        $this->parent->clear();
        unset($this->parent);
    }
    if (isset($this->root))
    {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc, $this->noise);
}
1143
+
1144
// Delegates to the root node's dump(), forwarding $show_attr.
function dump($show_attr=true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
1148
+
1149
// Prepares the HTML data and initialises the parser state: clears the previous
// state, records sizes, optionally replaces CR/LF with spaces, and creates a
// fresh root node.
protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // Length of the content exactly as it was handed to us.
    $this->original_size = strlen($str);
    $this->size = $this->original_size;

    // Strip out the \r \n's if we are told to, before the string becomes the doc.
    if ($stripRN) {
        $str = str_replace(array("\r", "\n"), " ", $str);

        // set the length of content since we have changed it.
        $this->size = strlen($str);
    }

    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    $root = new simple_html_dom_node($this);
    $this->root = $root;
    $this->root->tag = 'root';
    $this->root->_[HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $this->root;

    // Prime the current character when there is any content.
    if ($this->size > 0)
    {
        $this->char = $this->doc[0];
    }
}
1183
+
1184
// Parses the next piece of content.
// When no text precedes the next '<', the result of read_tag() is returned.
// Otherwise a text node holding that text is created and linked, and true is returned.
protected function parse()
{
    $text = $this->copy_until_char('<');
    if ($text === '')
    {
        return $this->read_tag();
    }

    // text
    $textNode = new simple_html_dom_node($this);
    ++$this->cursor;
    $textNode->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($textNode, false);
    return true;
}
1199
+
1200
// PAPERG - dkchou - Tries to identify the character set of the page just parsed,
// stores it in $this->_charset and returns it. Sources, in order:
//   1. get_last_retrieve_url_contents_content_type(), if such a function exists.
//      Provide it to return the CURLINFO_CONTENT_TYPE of the last curl_exec (or
//      the content_type header of the last transfer); a charset found there is
//      used over any other mechanism.
//   2. the <meta http-equiv=Content-Type> tag (ISO-8859-1 when it has content but
//      no charset);
//   3. mb_detect_encoding() on the plain text, if available;
//   4. UTF-8 as the last resort.
// ISO-8859-1 / Latin1 / Latin-1 are finally replaced by their superset CP1252.
protected function parse_charset()
{
    global $debug_object;

    $charset = null;

    // 1. content-type header supplied by the host application
    if (function_exists('get_last_retrieve_url_contents_content_type'))
    {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        if (preg_match('/charset=(.+)/', $contentTypeHeader, $matches))
        {
            $charset = $matches[1];
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);}
        }
    }

    // 2. meta tag
    if (empty($charset))
    {
        $el = $this->root->find('meta[http-equiv=Content-Type]',0);
        if (!empty($el))
        {
            $fullvalue = $el->content;
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);}

            if (!empty($fullvalue))
            {
                if (preg_match('/charset=(.+)/', $fullvalue, $matches))
                {
                    $charset = $matches[1];
                }
                else
                {
                    // A meta tag without a character set is typically ISO-8859-1.
                    if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // 3. / 4. detect from the text itself, else assume UTF-8
    if (empty($charset))
    {
        // Stays false when mb_detect_encoding isn't installed/loaded on this machine.
        $charset = false;
        if (function_exists('mb_detect_encoding'))
        {
            $charset = mb_detect_encoding($this->root->plaintext . "ascii", array( "UTF-8", "CP1252" ) );
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);}
        }

        if ($charset === false)
        {
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');}
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want it instead.
    $normalized = strtolower($charset);
    if ($normalized == 'iso-8859-1' || $normalized == 'latin1' || $normalized == 'latin-1')
    {
        if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);}

    return $this->_charset = $charset;
}
1277
+
1278
// Read one tag (or tag-like run of text) starting at the current position and attach the
// resulting node to the tree.
//
// Returns false when the current character is not '<' (the root node's end index is then set to
// the current cursor); returns true in every other case. The cases are handled in this order:
//   - end tags ("</x>"), including recovery when the end tag does not match the open parent;
//   - "<!...>" constructs (comments become 'comment' nodes, everything else 'unknown');
//   - a '<' that is followed by another '<' or by something that is not a tag name: kept as text;
//   - begin tags, whose attributes are read in a loop and delegated to parse_attr().
protected function read_tag()
{
    if ($this->char!=='<')
    {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char==='/')
    {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        // This represents the change in the simple_html_dom trunk from revision 180 to 181.
        // $this->skip($this->token_blank_t);
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' '))!==false)
            $tag = substr($tag, 0, $pos);

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        // The end tag does not close the currently open element: try to recover.
        if ($parent_lower!==$tag_lower)
        {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower]))
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Walk up the ancestors looking for the element this end tag closes.
                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) $this->parent = $this->parent->parent;
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && isset($this->block_tags[$tag_lower]))
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Walk up the ancestors looking for the element this end tag closes.
                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower)
                {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower)
            {
                // The end tag closes the grandparent: the parent is left without an end index.
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            else
                return $this->as_text_node($tag);
        }

        // Close the matched element and step back up to its parent (if it has one).
        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) $this->parent = $this->parent->parent;

        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0]==='!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, true);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text: the would-be tag name itself contains another '<'
    // (the original assigned the boolean comparison result to an unused $pos; that assignment is dropped)
    if (strpos($tag, '<')!==false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Not a valid tag name (word characters, ':' and '-' only): keep the run as text.
    // The hyphen is placed last in the character class on purpose: the former "[\w-:]" is rejected
    // by PCRE2 (PHP 7.3+) as an invalid range, which made preg_match() return false and turned
    // every begin tag into text.
    if (!preg_match('/^[\w:-]+$/', $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char==='<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags: implicitly close open ancestors that this tag terminates
    if (isset($this->optional_closing_tags[$tag_lower]) )
    {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)]))
        {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    // $space holds the whitespace before the name, between name and '=', and between '=' and value.
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do
    {
        // No whitespace after the tag name / previous attribute: nothing more to read here.
        if ($this->char!==null && $space[0]==='')
        {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        // Position did not move since the previous iteration: force one character of progress.
        if ($guard===$this->pos)
        {
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<' (document ends before the tag is closed)
        if ($this->pos>=$this->size-1 && $this->char!=='>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<' (a new '<' appeared before this tag was closed)
        if ($this->doc[$this->pos-1]=='<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
            // Step back so that the next read starts on the mismatched '<'.
            $this->pos -= 2;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name!=='/' && $name!=='') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) $name = strtolower($name);
            if ($this->char==='=') {
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            }
            else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        }
        else
            break;
    } while ($this->char!=='>' && $this->char!=='/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>')==='/')
    {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    }
    else
    {
        // reset parent: the new element becomes the open parent unless it is a known self-closing tag
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
    }
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set its text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // Since a br tag never has sub nodes, this works well.
    if ($node->tag == "br")
    {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
1502
+
1503
// Parse the value of attribute $name at the current position and store it on $node.
// $space is the caller's whitespace triple; slot 2 receives the blanks found before the value.
// The quoting style (double, single or none) is appended to the node's HDOM_INFO_QUOTE list.
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // When an attribute is repeated inside one tag, only the first occurrence counts.
    if (isset($node->attr[$name])) {
        return;
    }

    $space[2] = $this->copy_skip($this->token_blank);

    if ($this->char === '"' || $this->char === '\'') {
        // Quoted value: remember the delimiter, step over it, read up to the matching
        // (unescaped) delimiter and step over that one as well.
        $delimiter = $this->char;
        $node->_[HDOM_INFO_QUOTE][] = ($delimiter === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape($delimiter));
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    } else {
        // Unquoted value: runs until one of the token_attr characters.
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
    }

    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $node->attr[$name] = str_replace(array("\r", "\n"), "", $node->attr[$name]);

    // PaperG: For "class", drop leading and trailing spaces, since some people leave them in
    // when listing several classes.
    if ($name == "class") {
        $node->attr[$name] = trim($node->attr[$name]);
    }
}
1539
+
1540
// Attach $node to the currently open parent: set its parent pointer, append it to the parent's
// node list and, when $is_child is truthy, to the parent's children list as well.
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child) {
        return;
    }
    $owner->children[] = $node;
}
1550
+
1551
// Keep an end tag that could not be matched as literal text: create a node whose text is
// "</tag>", link it under the current parent (not as a child), advance one character and
// report success.
protected function as_text_node($tag)
{
    $text_node = new simple_html_dom_node($this);
    $this->cursor++;
    $text_node->_[HDOM_INFO_TEXT] = "</{$tag}>";
    $this->link_nodes($text_node, false);

    // next
    $this->pos++;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return true;
}
1561
+
1562
// Advance past every leading character that belongs to $chars, then refresh the current
// character (null once the end of the document is reached).
protected function skip($chars)
{
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos = $this->pos + $run;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
1567
+
1568
// Like skip(), but also returns the run of skipped characters ('' when nothing was skipped).
protected function copy_skip($chars)
{
    $start = $this->pos;
    $run = strspn($this->doc, $chars, $start);

    $this->pos += $run;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return ($run === 0) ? '' : substr($this->doc, $start, $run);
}
1577
+
1578
// Consume characters up to (not including) the first one contained in $chars and return them.
// The current character becomes that stop character, or null at the end of the document.
protected function copy_until($chars)
{
    $start = $this->pos;
    $run = strcspn($this->doc, $chars, $start);

    $this->pos += $run;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $start, $run);
}
1586
+
1587
// Consume and return everything up to (not including) the next occurrence of $char.
// When $char never occurs, the rest of the document is consumed and the current character
// becomes null. Returns '' at the end of the document or when already positioned on $char.
protected function copy_until_char($char)
{
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if ($hit === false) {
        // Not found: hand back the remainder and move to the end.
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($hit === $from) {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit - $from);
}
1604
+
1605
// Same as copy_until_char(), except that an occurrence of $char directly preceded by a
// backslash is treated as escaped and skipped over while searching.
protected function copy_until_char_escape($char)
{
    if ($this->char === null) {
        return '';
    }

    $search_from = $this->pos;
    for (;;) {
        $hit = strpos($this->doc, $char, $search_from);

        if ($hit === false) {
            // No unescaped occurrence left: hand back the remainder and move to the end.
            $rest = substr($this->doc, $this->pos, $this->size - $this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        if ($hit === $this->pos) {
            return '';
        }

        if ($this->doc[$hit - 1] !== '\\') {
            break;
        }

        // Escaped occurrence: keep looking right after it.
        $search_from = $hit + 1;
    }

    $from = $this->pos;
    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit - $from);
}
1633
+
1634
// Cut every match of $pattern out of the document and park it in $this->noise.
// Each match is replaced in $this->doc by a placeholder key ('___noise___' followed by a
// 5-character number). With $remove_tag the whole match (group 0) is moved, otherwise only
// capture group 1. Afterwards the cached document size and current character are refreshed.
protected function remove_noise($pattern, $remove_tag=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $found = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
    $group = ($remove_tag) ? 0 : 1;

    // Replace from the last match backwards so that earlier offsets stay valid.
    $i = $found - 1;
    while ($i > -1)
    {
        $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
        if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); }

        $this->noise[$key] = $matches[$i][$group][0];
        $this->doc = substr_replace(
            $this->doc,
            $key,
            $matches[$i][$group][1],
            strlen($matches[$i][$group][0])
        );

        --$i;
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size > 0)
    {
        $this->char = $this->doc[0];
    }
}
1659
+
1660
// Put previously removed noise back into $text.
//
// Every placeholder ('___noise___' followed by a 5-character number, 16 characters in total)
// that is a key of $this->noise is replaced by the stored text. Restored text is scanned again,
// so placeholders nested inside restored noise are expanded too.
// A placeholder without a stored value is replaced by 'UNDEFINED NOISE FOR KEY: <key>', and a
// '___noise___' marker too close to the end of $text to carry a number is replaced by
// 'NO NUMERIC NOISE KEY'.
//
// Returns the text with all known placeholders expanded.
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // Position from which the next search starts. It only moves forward past
    // "UNDEFINED NOISE" markers: those markers contain the unknown key itself, so searching
    // from the start again (as the previous implementation did) found the same key forever.
    $offset = 0;

    while (($pos=strpos($text, '___noise___', $offset))!==false)
    {
        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos+15)
        {
            // The marker plus the 5 characters that follow it.
            $key = substr($text, $pos, 16);
            if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); }

            if (isset($this->noise[$key]))
            {
                // $offset is left alone so the restored text is scanned for nested keys.
                $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
            }
            else
            {
                // Unknown key: leave a visible marker and continue after it to prevent an infinite loop.
                $marker = 'UNDEFINED NOISE FOR KEY: '.$key;
                $text = substr($text, 0, $pos).$marker.substr($text, $pos+16);
                $offset = $pos + strlen($marker);
            }
        }
        else
        {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11);
        }
    }
    return $text;
}
1692
+
1693
// Sometimes one of the removed noise elements is needed after all: return the first stored
// noise text that contains $text. Nothing is returned (null) when no element matches.
function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    foreach ($this->noise as $stored)
    {
        if (strpos($stored, $text) === false)
        {
            continue;
        }
        return $stored;
    }
}
1707
// String conversion yields the inner text of the root node.
function __toString()
{
    $root = $this->root;
    return $root->innertext();
}
1711
+
1712
// Read-only virtual properties: 'outertext' and 'innertext' (both the root's inner text),
// 'plaintext' (the root's text), 'charset' and 'target_charset'.
// Any other name yields nothing (null).
function __get($name)
{
    if ($name == 'outertext' || $name == 'innertext')
    {
        return $this->root->innertext();
    }
    if ($name == 'plaintext')
    {
        return $this->root->text();
    }
    if ($name == 'charset')
    {
        return $this->_charset;
    }
    if ($name == 'target_charset')
    {
        return $this->_target_charset;
    }
}
1728
+
1729
// camelCase aliases that forward to the root node, to find(), or to str_get_html().

// Forwards to the root node's childNodes($idx).
function childNodes($idx=-1)
{
    return $this->root->childNodes($idx);
}

// Forwards to the root node's first_child().
function firstChild()
{
    return $this->root->first_child();
}

// Forwards to the root node's last_child().
function lastChild()
{
    return $this->root->last_child();
}

// Parses "<name>value</name>" as a fresh document and returns its first child; errors are suppressed.
function createElement($name, $value=null)
{
    return @str_get_html('<' . $name . '>' . $value . '</' . $name . '>')->first_child();
}

// Parses $value as a fresh document and returns the last entry of its node list; errors are suppressed.
function createTextNode($value)
{
    return @end(str_get_html($value)->nodes);
}

// First element matching the selector "#id".
function getElementById($id)
{
    return $this->find('#' . $id, 0);
}

// Elements matching the selector "#id"; $idx is handed to find() unchanged.
function getElementsById($id, $idx=null)
{
    return $this->find('#' . $id, $idx);
}

// First element matching the selector $name.
function getElementByTagName($name)
{
    return $this->find($name, 0);
}

// Elements matching the selector $name; $idx is handed to find() unchanged.
function getElementsByTagName($name, $idx=-1)
{
    return $this->find($name, $idx);
}
1739
// camelCase alias of load_file(): forwards every argument it receives.
// The previous body passed the whole argument array as ONE argument, so load_file() saw an
// array where the first real argument was expected; the arguments are now spread.
// NOTE(review): load_file() is defined outside this excerpt and is presumed to take the same
// positional arguments as this alias - confirm. Its result is passed back to the caller.
function loadFile()
{
    $args = func_get_args();
    return call_user_func_array(array($this, 'load_file'), $args);
}
1740
+ }
1741
+
1742
+ ?>