Orphans - Version 3.1.0

Version Description

  • 2022-11-30

  • Replaced the simple string replacement with DOM parsing, so replacements are applied only to text strings and not to markup.

Download this release

Release Info

Developer iworks
Plugin Icon 128x128 Orphans
Version 3.1.0
Comparing to
See all releases

Code changes from version 3.0.4 to 3.1.0

includes/iworks/class-iworks-orphan.php CHANGED
@@ -48,7 +48,26 @@ class iworks_orphan {
48
  *
49
  * @since 3.0.0
50
  */
51
- private $version = '3.0.4';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  public function __construct() {
54
  /**
@@ -193,59 +212,52 @@ class iworks_orphan {
193
  }
194
 
195
  /**
196
- * Unconditional replacement with super-base check is replacement even
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  * possible.
198
  *
199
- * @since 2.7.8
200
  *
201
  * @param string $content String to replace
202
  *
203
  * @return string $content
204
  */
205
- private function unconditional_replacement( $content ) {
206
  /**
207
  * only super-base check
208
  */
209
  if ( ! is_string( $content ) || empty( $content ) ) {
210
  return $content;
211
  }
212
- /**
213
- * Avoid to replace attributes
214
- */
215
- $attributes = array();
216
- if ( $this->options->get_option( 'attributes' ) ) {
217
- preg_match_all( '/(style|class|data-[a-z\-]+)="[^"]+"/', $content, $matches );
218
- if ( ! empty( $matches ) ) {
219
- $salt = 'Sae9ieCheyieph3ug7si4yeiBoog0fae4yae6biexaimie0ied7quienae3yeepo';
220
- foreach ( $matches[0] as $value ) {
221
- if ( empty( $value ) ) {
222
- continue;
223
- }
224
- if ( preg_match( '/ /', $value ) ) {
225
- $attributes[ $value ] = md5( $salt . $value );
226
- }
227
- }
228
- if ( ! empty( $attributes ) ) {
229
- foreach ( $attributes as $part => $to_change ) {
230
- $content = str_replace( $part, $to_change, $content );
231
- }
232
- }
233
- }
234
- }
235
- /**
236
- * Avoid to replace inside script or styles tags
237
- */
238
- preg_match_all( '@(<(script|style)[^>]*>.*?(</(script|style)>))@is', $content, $matches );
239
- $exceptions = array();
240
- if ( ! empty( $matches ) && ! empty( $matches[0] ) ) {
241
- $salt = 'kQc6T9fn5GhEzTM3Sxn7b9TWMV4PO0mOCV06Da7AQJzSJqxYR4z3qBlsW9rtFsWK';
242
- foreach ( $matches[0] as $one ) {
243
- $key = sprintf( '<!-- %s %s -->', $salt, md5( $one ) );
244
- $exceptions[ $key ] = $one;
245
- $re = sprintf( '@%s@', preg_replace( '/@/', '\@', preg_quote( $one, '/' ) ) );
246
- $content = preg_replace( $re, $key, $content );
247
- }
248
- }
249
  /**
250
  * Keep numbers together - this is independed of current language
251
  */
@@ -257,7 +269,7 @@ class iworks_orphan {
257
  foreach ( $parts as $part ) {
258
  $to_change = $part;
259
  while ( preg_match( '/(\d+) ([\da-z]+)/i', $to_change, $matches ) ) {
260
- $to_change = preg_replace( '/(\d+) ([\da-z]+)/i', '$1&nbsp;$2', $to_change );
261
  }
262
  if ( $part != $to_change ) {
263
  $content = str_replace( $part, $to_change, $content );
@@ -288,16 +300,16 @@ class iworks_orphan {
288
  * base therms replace
289
  */
290
  $re = '/^([aiouwz]|' . preg_replace( '/\./', '\.', implode( '|', $terms ) ) . ') +/i';
291
- $part_to_change = preg_replace( $re, '$1$2&nbsp;', $part_to_change );
292
  /**
293
  * single letters
294
  */
295
- $re = '/([ >\(]+|&nbsp;|&#8222;|&quot;)([aiouwz]|' . preg_replace( '/\./', '\.', implode( '|', $terms ) ) . ') +/i';
296
  /**
297
  * double call to handle orphan after orphan after orphan
298
  */
299
- $part_to_change = preg_replace( $re, '$1$2&nbsp;', $part_to_change );
300
- $part_to_change = preg_replace( $re, '$1$2&nbsp;', $part_to_change );
301
  }
302
  if ( $part_source !== $part_to_change ) {
303
  $content = str_replace( $part_source, $part_to_change, $content );
@@ -306,8 +318,8 @@ class iworks_orphan {
306
  /**
307
  * single letter after previous orphan
308
  */
309
- $re = '/(&nbsp;)([aiouwz]) +/i';
310
- $content = preg_replace( $re, '$1$2&nbsp;', $content );
311
  /**
312
  * bring back styles & scripts
313
  */
@@ -330,13 +342,80 @@ class iworks_orphan {
330
  *
331
  * @since 3.0.4
332
  */
333
- $content = preg_replace( '/(\d) r\./', '$1&nbsp;r.', $content );
334
  /**
335
  * return
336
  */
337
  return $content;
338
  }
339
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  /**
341
  * Inicialize admin area
342
  */
@@ -464,13 +543,21 @@ class iworks_orphan {
464
  * @since 3.0.2
465
  */
466
  add_filter( 'vc_shortcode_output', array( $this, 'replace' ) );
467
-
468
  /**
469
- * Integrations: Goodlayers Core
470
  *
471
- * @since 3.0.4
 
 
 
 
 
 
472
  */
473
- add_filter( 'gdlr_core_escape_content', array( $this, 'replace' ) );
 
 
 
474
  }
475
 
476
  /**
48
  *
49
  * @since 3.0.0
50
  */
51
+ private $version = '3.1.0';
52
+
53
+ /**
54
+ * tags to avoid replacement
55
+ *
56
+ * @since 3.1.0
57
+ */
58
+ private $protected_tags = array(
59
+ 'script',
60
+ 'style',
61
+ 'iframe',
62
+ 'svg',
63
+ );
64
+
65
+ /**
66
+ * nbsp placeholder
67
+ *
68
+ * @since 3.1.0
69
+ */
70
+ private $nbsp_placeholder = '&nbsp;';
71
 
72
  public function __construct() {
73
  /**
212
  }
213
 
214
  /**
215
+ * Parse DOMElement Object
216
+ *
217
+ * @since 3.1.0
218
+ */
219
private function parse_item( $item ) {
	$html = $item->innertext;
	/**
	 * No markup inside: run the replacement on the whole string.
	 */
	if ( ! preg_match( '/</', $html ) ) {
		$item->innertext = $this->string_replacement( $html );
		return;
	}
	/**
	 * Collect the tags and the text slices between them, then rebuild
	 * the inner HTML: only the text slices go through the replacement,
	 * every tag is copied back verbatim in its original position.
	 */
	preg_match_all( '/<[^>]+>/', $html, $tags );
	$slices = preg_split( '/<[^>]+>/', $html );
	$result = '';
	foreach ( $slices as $index => $slice ) {
		$result .= $this->string_replacement( $slice );
		if ( isset( $tags[0][ $index ] ) ) {
			$result .= $tags[0][ $index ];
		}
	}
	$item->innertext = $result;
}
243
+
244
+ /**
245
+ * String replacement with super-base check is replacement even
246
  * possible.
247
  *
248
+ * @since 3.1.0
249
  *
250
  * @param string $content String to replace
251
  *
252
  * @return string $content
253
  */
254
+ private function string_replacement( $content ) {
255
  /**
256
  * only super-base check
257
  */
258
  if ( ! is_string( $content ) || empty( $content ) ) {
259
  return $content;
260
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  /**
262
  * Keep numbers together - this is independed of current language
263
  */
269
  foreach ( $parts as $part ) {
270
  $to_change = $part;
271
  while ( preg_match( '/(\d+) ([\da-z]+)/i', $to_change, $matches ) ) {
272
+ $to_change = preg_replace( '/(\d+) ([\da-z]+)/i', '$1' . $this->nbsp_placeholder . '$2', $to_change );
273
  }
274
  if ( $part != $to_change ) {
275
  $content = str_replace( $part, $to_change, $content );
300
  * base therms replace
301
  */
302
  $re = '/^([aiouwz]|' . preg_replace( '/\./', '\.', implode( '|', $terms ) ) . ') +/i';
303
+ $part_to_change = preg_replace( $re, '$1$2' . $this->nbsp_placeholder, $part_to_change );
304
  /**
305
  * single letters
306
  */
307
+ $re = '/([ >\(]+|' . $this->nbsp_placeholder . '|&#8222;|&quot;)([aiouwz]|' . preg_replace( '/\./', '\.', implode( '|', $terms ) ) . ') +/i';
308
  /**
309
  * double call to handle orphan after orphan after orphan
310
  */
311
+ $part_to_change = preg_replace( $re, '$1$2' . $this->nbsp_placeholder, $part_to_change );
312
+ $part_to_change = preg_replace( $re, '$1$2' . $this->nbsp_placeholder, $part_to_change );
313
  }
314
  if ( $part_source !== $part_to_change ) {
315
  $content = str_replace( $part_source, $part_to_change, $content );
318
  /**
319
  * single letter after previous orphan
320
  */
321
+ $re = '/(' . $this->nbsp_placeholder . ')([aiouwz]) +/i';
322
+ $content = preg_replace( $re, '$1$2' . $this->nbsp_placeholder, $content );
323
  /**
324
  * bring back styles & scripts
325
  */
342
  *
343
  * @since 3.0.4
344
  */
345
+ $content = preg_replace( '/(\d) r\./', '$1' . $this->nbsp_placeholder . 'r.', $content );
346
  /**
347
  * return
348
  */
349
  return $content;
350
  }
351
 
352
+ /**
353
+ * Unconditional replacement with super-base check is replacement even
354
+ * possible.
355
+ *
356
+ * @since 2.7.8
357
+ * @since 3.1.0 - changed into DOM parsing
358
+ *
359
+ *
360
+ * @param string $content String to replace
361
+ *
362
+ * @return string $content
363
+ */
364
private function unconditional_replacement( $content ) {
	/**
	 * only super-base check
	 */
	if ( ! is_string( $content ) || empty( $content ) ) {
		return $content;
	}
	/**
	 * string, no tags
	 */
	if ( strip_tags( $content ) === $content ) {
		return $this->string_replacement( $content );
	}
	/**
	 * parse
	 *
	 * str_get_html() returns false instead of a DOM object when the input
	 * is longer than MAX_FILE_SIZE. Leave such content untouched rather
	 * than calling methods on a non-object.
	 */
	$doc = str_get_html( $content );
	if ( ! is_object( $doc ) ) {
		return $content;
	}
	/**
	 * remove protected tags: stash their attributes and inner text and
	 * leave an empty element marked with a temporary id
	 */
	$protected = array();
	foreach ( $this->protected_tags as $tag ) {
		foreach ( $doc->find( $tag ) as $item ) {
			$innertext  = $item->innertext;
			$attributes = $item->getAllAttributes();
			$key        = md5( $tag . $innertext . implode( '', $attributes ) );
			$protected[ $key ] = array(
				'attributes' => $attributes,
				'innertext'  => $innertext,
			);
			foreach ( $attributes as $name => $value ) {
				$item->removeAttribute( $name );
			}
			$item->innertext = '';
			$item->setAttribute( 'id', 'orphans-' . $key );
		}
	}
	/**
	 * replace
	 *
	 * NOTE(review): parse_item() assigns innertext on every element,
	 * ancestors included. Confirm that a protected tag nested inside such
	 * an ancestor still reaches the saved output after the restore below.
	 */
	foreach ( $doc->find( '*' ) as $item ) {
		$this->parse_item( $item );
	}
	/**
	 * revert protected tags
	 */
	foreach ( $protected as $key => $data ) {
		foreach ( $doc->find( '#orphans-' . $key ) as $item ) {
			$item->innertext = $data['innertext'];
			$item->removeAttribute( 'id' );
			foreach ( $data['attributes'] as $name => $value ) {
				$item->setAttribute( $name, $value );
			}
		}
	}
	return $doc->save();
}
418
+
419
  /**
420
  * Inicialize admin area
421
  */
543
  * @since 3.0.2
544
  */
545
  add_filter( 'vc_shortcode_output', array( $this, 'replace' ) );
 
546
  /**
547
+ * Filter that allows changing the list of protected tags.
548
  *
549
+ * @since 3.1.0
550
+ *
551
+ * @param array $args {
552
+ * Array of protected tags - all content of this tags will be not
553
+ * replaced
554
+ *
555
+ * @type string HTML tag name.
556
  */
557
+ $this->protected_tags = apply_filters(
558
+ 'iworks_orphan_protected_tags',
559
+ $this->protected_tags
560
+ );
561
  }
562
 
563
  /**
languages/sierotki.pot CHANGED
@@ -4,7 +4,7 @@ msgid ""
4
  msgstr ""
5
  "Project-Id-Version: Orphans PLUGIN_VERSION\n"
6
  "Report-Msgid-Bugs-To: https://wordpress.org/support/plugin/sierotki-dev\n"
7
- "POT-Creation-Date: 2022-11-20 19:14:56+00:00\n"
8
  "MIME-Version: 1.0\n"
9
  "Content-Type: text/plain; charset=utf-8\n"
10
  "Content-Transfer-Encoding: 8bit\n"
@@ -232,7 +232,7 @@ msgstr ""
232
  msgid "WordPress Help Forum"
233
  msgstr ""
234
 
235
- #: includes/iworks/class-iworks-orphan.php:497
236
  #: includes/iworks/rate/rate.php:124
237
  msgid "Settings"
238
  msgstr ""
4
  msgstr ""
5
  "Project-Id-Version: Orphans PLUGIN_VERSION\n"
6
  "Report-Msgid-Bugs-To: https://wordpress.org/support/plugin/sierotki-dev\n"
7
+ "POT-Creation-Date: 2022-12-14 12:21:37+00:00\n"
8
  "MIME-Version: 1.0\n"
9
  "Content-Type: text/plain; charset=utf-8\n"
10
  "Content-Transfer-Encoding: 8bit\n"
232
  msgid "WordPress Help Forum"
233
  msgstr ""
234
 
235
+ #: includes/iworks/class-iworks-orphan.php:584
236
  #: includes/iworks/rate/rate.php:124
237
  msgid "Settings"
238
  msgstr ""
readme.txt CHANGED
@@ -4,7 +4,7 @@ Donate link: https://ko-fi.com/iworks?utm_source=sierotki&utm_medium=readme-dona
4
  Tags: sierotka, sierotki, spójniki, twarda spacja
5
  Requires at least: 4.6
6
  Tested up to: 6.1
7
- Stable tag: 3.0.4
8
 
9
 
10
 
@@ -128,9 +128,12 @@ function remove_iworks_orphan_terms( $terms ) {
128
 
129
  == Changelog ==
130
 
131
- = 3.0.4 - 2022-11-20 =
 
 
 
 
132
  * Handle space after year for short year format "r.". Props for [Mastafu Design](https://wordpress.org/support/users/mastafu/)
133
- * Added integration with "Goodlayers Core" on `gdlr_core_escape_content` filter.
134
 
135
  = 3.0.3 - 2022-09-02 =
136
  * Handle ACF integration if it is network activated plugin. Props for [maczek6000](https://profiles.wordpress.org/maczek6000/).
4
  Tags: sierotka, sierotki, spójniki, twarda spacja
5
  Requires at least: 4.6
6
  Tested up to: 6.1
7
+ Stable tag: 3.1.0
8
 
9
 
10
 
128
 
129
  == Changelog ==
130
 
131
+ = 3.1.0 - 2022-11-30 =
132
+
133
+ * Replaced the simple string replacement with DOM parsing, so replacements are applied only to text strings and not to markup.
134
+
135
+ = 3.0.4 - 2022-10-25 =
136
  * Handle space after year for short year format "r.". Props for [Mastafu Design](https://wordpress.org/support/users/mastafu/)
 
137
 
138
  = 3.0.3 - 2022-09-02 =
139
  * Handle ACF integration if it is network activated plugin. Props for [maczek6000](https://profiles.wordpress.org/maczek6000/).
sierotki.php CHANGED
@@ -5,7 +5,7 @@ Plugin URI: http://iworks.pl/2011/02/16/sierotki/
5
  Text Domain: sierotki
6
  Description: Implement Polish grammar rules with orphans.
7
  Author: Marcin Pietrzak
8
- Version: 3.0.4
9
  Author URI: http://iworks.pl/
10
  */
11
 
@@ -13,6 +13,10 @@ include_once dirname( __FILE__ ) . '/etc/options.php';
13
 
14
  load_plugin_textdomain( 'sierotki', false, dirname( __FILE__ ) . '/languages' );
15
 
 
 
 
 
16
  $includes = dirname( __FILE__ ) . '/includes';
17
 
18
  require_once $includes . '/iworks/class-iworks-orphan.php';
5
  Text Domain: sierotki
6
  Description: Implement Polish grammar rules with orphans.
7
  Author: Marcin Pietrzak
8
+ Version: 3.1.0
9
  Author URI: http://iworks.pl/
10
  */
11
 
13
 
14
  load_plugin_textdomain( 'sierotki', false, dirname( __FILE__ ) . '/languages' );
15
 
16
+ if ( ! defined( 'HDOM_TYPE_ELEMENT' ) ) {
17
+ require_once dirname( __FILE__ ) . '/vendor/simple_html_dom.php';
18
+ }
19
+
20
  $includes = dirname( __FILE__ ) . '/includes';
21
 
22
  require_once $includes . '/iworks/class-iworks-orphan.php';
vendor/index.php ADDED
@@ -0,0 +1,2 @@
 
 
1
+ <?php
2
+ exit;
vendor/simple_html_dom.php ADDED
@@ -0,0 +1,2353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?php
2
+ /**
3
+ * Website: http://sourceforge.net/projects/simplehtmldom/
4
+ * Additional projects: http://sourceforge.net/projects/debugobject/
5
+ * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
6
+ *
7
+ * Licensed under The MIT License
8
+ * See the LICENSE file in the project root for more information.
9
+ *
10
+ * Authors:
11
+ * S.C. Chen
12
+ * John Schlick
13
+ * Rus Carroll
14
+ * logmanoriginal
15
+ *
16
+ * Contributors:
17
+ * Yousuke Kumakura
18
+ * Vadim Voituk
19
+ * Antcs
20
+ *
21
+ * Version Rev. 1.9.1 (291)
22
+ */
23
+
24
+ define('HDOM_TYPE_ELEMENT', 1);
25
+ define('HDOM_TYPE_COMMENT', 2);
26
+ define('HDOM_TYPE_TEXT', 3);
27
+ define('HDOM_TYPE_ENDTAG', 4);
28
+ define('HDOM_TYPE_ROOT', 5);
29
+ define('HDOM_TYPE_UNKNOWN', 6);
30
+ define('HDOM_QUOTE_DOUBLE', 0);
31
+ define('HDOM_QUOTE_SINGLE', 1);
32
+ define('HDOM_QUOTE_NO', 3);
33
+ define('HDOM_INFO_BEGIN', 0);
34
+ define('HDOM_INFO_END', 1);
35
+ define('HDOM_INFO_QUOTE', 2);
36
+ define('HDOM_INFO_SPACE', 3);
37
+ define('HDOM_INFO_TEXT', 4);
38
+ define('HDOM_INFO_INNER', 5);
39
+ define('HDOM_INFO_OUTER', 6);
40
+ define('HDOM_INFO_ENDSPACE', 7);
41
+
42
+ defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
43
+ defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
44
+ defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
45
+ defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
46
+ define('HDOM_SMARTY_AS_TEXT', 1);
47
+
48
/**
 * Read a file or URL and parse it into a simple_html_dom object.
 *
 * A non-positive $maxLen means "use MAX_FILE_SIZE". Returns false when
 * nothing could be read or the content is longer than the limit.
 */
function file_get_html(
	$url,
	$use_include_path = false,
	$context = null,
	$offset = 0,
	$maxLen = -1,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	$limit = ($maxLen <= 0) ? MAX_FILE_SIZE : $maxLen;

	$document = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	$contents = file_get_contents($url, $use_include_path, $context, $offset, $limit);

	if (!empty($contents) && strlen($contents) <= $limit) {
		return $document->load($contents, $lowercase, $stripRN);
	}

	// Nothing usable was read: release the parser and signal failure.
	$document->clear();
	return false;
}
93
+
94
/**
 * Parse an HTML string into a simple_html_dom object.
 *
 * Returns false when $str is empty or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
	$str,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	$document = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	if (!empty($str) && strlen($str) <= MAX_FILE_SIZE) {
		return $document->load($str, $lowercase, $stripRN);
	}

	// Empty or oversized input: release the parser and signal failure.
	$document->clear();
	return false;
}
120
+
121
/**
 * Print the tag tree below $node by delegating to the node's dump() method.
 *
 * @param object $node      Node whose subtree is printed.
 * @param bool   $show_attr Whether attributes are printed next to each tag.
 * @param int    $deep      Indentation depth of the first printed line.
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
	// Forward the caller's options; the node itself must not be passed as
	// the $show_attr argument of dump().
	$node->dump($show_attr, $deep);
}
125
+
126
+ class simple_html_dom_node
127
+ {
128
+ public $nodetype = HDOM_TYPE_TEXT;
129
+ public $tag = 'text';
130
+ public $attr = array();
131
+ public $children = array();
132
+ public $nodes = array();
133
+ public $parent = null;
134
+ public $_ = array();
135
+ public $tag_start = 0;
136
+ private $dom = null;
137
+
138
/**
 * Create a node owned by $dom and append it to the DOM's node list.
 */
function __construct($dom)
{
	$dom->nodes[] = $this;
	$this->dom = $dom;
}

/**
 * Release all references when the node is destroyed.
 */
function __destruct()
{
	$this->clear();
}

/**
 * A node cast to string yields its outer text.
 */
function __toString()
{
	return $this->outertext();
}

/**
 * Drop the links to the DOM, the parent and the child nodes.
 */
function clear()
{
	$this->children = null;
	$this->parent = null;
	$this->nodes = null;
	$this->dom = null;
}
161
+
162
/**
 * Echo this node's tag (optionally with its attributes), indented by
 * $depth tabs, then recurse into the child nodes one level deeper.
 */
function dump($show_attr = true, $depth = 0)
{
	$line = str_repeat("\t", $depth) . $this->tag;

	if ($show_attr && count($this->attr) > 0) {
		$line .= '(';
		foreach ($this->attr as $name => $value) {
			$line .= "[$name]=>\"$value\", ";
		}
		$line .= ')';
	}

	echo $line . "\n";

	if (!$this->nodes) {
		return;
	}

	foreach ($this->nodes as $child) {
		$child->dump($show_attr, $depth + 1);
	}
}
182
+
183
/**
 * Build a one-line debug description of this node: tag, attributes, the
 * internal info array, text, cached inner text, and child/node counts.
 *
 * @param bool $echo When true the description is echoed and nothing is
 *                   returned; when false it is returned as a string.
 *
 * @return string|void
 */
function dump_node($echo = true)
{
	$string = $this->tag;

	if (count($this->attr) > 0) {
		$string .= '(';
		foreach ($this->attr as $k => $v) {
			$string .= "[$k]=>\"$v\", ";
		}
		$string .= ')';
	}

	if (count($this->_) > 0) {
		$string .= ' $_ (';
		foreach ($this->_ as $k => $v) {
			if (is_array($v)) {
				$string .= "[$k]=>(";
				foreach ($v as $k2 => $v2) {
					$string .= "[$k2]=>\"$v2\", ";
				}
				$string .= ')';
			} else {
				$string .= "[$k]=>\"$v\", ";
			}
		}
		$string .= ')';
	}

	if (isset($this->text)) {
		$string .= " text: ({$this->text})";
	}

	$string .= ' HDOM_INNER_INFO: ';

	// Read the cached inner text from this node; there is no local $node
	// in this method, so the branch must inspect $this.
	if (isset($this->_[HDOM_INFO_INNER])) {
		$string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
	} else {
		$string .= ' NULL ';
	}

	$string .= ' children: ' . count($this->children);
	$string .= ' nodes: ' . count($this->nodes);
	$string .= ' tag_start: ' . $this->tag_start;
	$string .= "\n";

	if ($echo) {
		echo $string;
		return;
	} else {
		return $string;
	}
}
235
+
236
/**
 * Return the parent node; when $parent is given, attach this node to it
 * first.
 *
 * Caveat kept from the original author: attaching does not detach this
 * node from the node/children lists of its previous parent.
 */
function parent($parent = null)
{
	if ($parent === null) {
		return $this->parent;
	}

	$this->parent = $parent;
	$this->parent->nodes[] = $this;
	$this->parent->children[] = $this;

	return $this->parent;
}
249
+
250
/**
 * Tell whether this node has at least one child element.
 */
function has_child()
{
	return (bool)$this->children;
}

/**
 * Return all children ($idx === -1), the child at $idx, or null when
 * there is no child at that index.
 */
function children($idx = -1)
{
	if ($idx === -1) {
		return $this->children;
	}

	return isset($this->children[$idx]) ? $this->children[$idx] : null;
}

/**
 * Return the first child, or null when there are no children.
 */
function first_child()
{
	return (count($this->children) > 0) ? $this->children[0] : null;
}

/**
 * Return the last child, or null when there are no children.
 */
function last_child()
{
	return (count($this->children) > 0) ? end($this->children) : null;
}
283
+
284
/**
 * Return the child that follows this node in its parent's children list,
 * or null when there is no parent or no following child.
 */
function next_sibling()
{
	if ($this->parent === null) {
		return null;
	}

	$siblings = $this->parent->children;
	$position = array_search($this, $siblings, true);

	if ($position === false) {
		return null;
	}

	$next = $position + 1;
	return isset($siblings[$next]) ? $siblings[$next] : null;
}
298
+
299
/**
 * Return the child that precedes this node in its parent's children list,
 * or null when there is no parent or this node is the first child.
 */
function prev_sibling()
{
	if ($this->parent === null) {
		return null;
	}

	$siblings = $this->parent->children;
	$position = array_search($this, $siblings, true);

	if ($position === false || !($position > 0)) {
		return null;
	}

	return $siblings[$position - 1];
}
313
+
314
/**
 * Walk up the parent chain and return the nearest ancestor whose tag is
 * exactly $tag, or null when no ancestor matches.
 */
function find_ancestor_tag($tag)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	if ($this->parent === null) {
		return null;
	}

	for ($ancestor = $this->parent; !is_null($ancestor); $ancestor = $ancestor->parent) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
		}

		if ($ancestor->tag === $tag) {
			return $ancestor;
		}
	}

	// Reached the top of the tree without a match.
	return null;
}
339
+
340
/**
 * Return the inner HTML of this node: the cached inner text when set,
 * the noise-restored text for text-like nodes, otherwise the
 * concatenated outer text of all child nodes.
 */
function innertext()
{
	if (isset($this->_[HDOM_INFO_INNER])) {
		return $this->_[HDOM_INFO_INNER];
	}

	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$parts = array();

	foreach ($this->nodes as $child) {
		$parts[] = $child->outertext();
	}

	return implode('', $parts);
}
358
+
359
/**
 * Return the outer HTML of this node: opening tag, inner content and,
 * where the node has one, the closing tag. Cached outer text and
 * text-like nodes are returned directly.
 */
function outertext()
{
	global $debug_object;

	if (is_object($debug_object)) {
		$suffix = ($this->tag === 'text' && !empty($this->text))
			? ' with text: ' . $this->text
			: '';

		$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
	}

	// The root node has no markup of its own.
	if ($this->tag === 'root') {
		return $this->innertext();
	}

	// todo: What is the use of this callback? Remove?
	if ($this->dom && $this->dom->callback !== null) {
		call_user_func_array($this->dom->callback, array($this));
	}

	if (isset($this->_[HDOM_INFO_OUTER])) {
		return $this->_[HDOM_INFO_OUTER];
	}

	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$html = '';

	// Opening tag, rendered by the node registered at this node's begin index.
	if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
		$html = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
	}

	if (isset($this->_[HDOM_INFO_INNER])) {
		// todo: <br> should either never have HDOM_INFO_INNER or always
		if ($this->tag !== 'br') {
			$html .= $this->_[HDOM_INFO_INNER];
		}
	} elseif ($this->nodes) {
		foreach ($this->nodes as $child) {
			$html .= $this->convert_text($child->outertext());
		}
	}

	if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
		$html .= '</' . $this->tag . '>';
	}

	return $html;
}
415
+
416
/**
 * Return the plain-text content of this node. Comments, unknown nodes,
 * and script/style elements yield an empty string.
 */
function text()
{
	if (isset($this->_[HDOM_INFO_INNER])) {
		return $this->_[HDOM_INFO_INNER];
	}

	if ($this->nodetype == HDOM_TYPE_TEXT) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	if ($this->nodetype == HDOM_TYPE_COMMENT || $this->nodetype == HDOM_TYPE_UNKNOWN) {
		return '';
	}

	if (strcasecmp($this->tag, 'script') === 0 || strcasecmp($this->tag, 'style') === 0) {
		return '';
	}

	$plain = '';

	// The original author observed $this->nodes being NULL for some
	// element nodes without a known cause, hence the guard.
	if (is_null($this->nodes)) {
		return $plain;
	}

	foreach ($this->nodes as $child) {
		// Start paragraph after a blank line
		if ($child->tag === 'p') {
			$plain = trim($plain) . "\n\n";
		}

		$plain .= $this->convert_text($child->text());

		// Separate consecutive spans so their plain text does not run
		// together.
		if ($child->tag === 'span') {
			$plain .= $this->dom->default_span_text;
		}
	}

	return $plain;
}
457
+
458
/**
 * Return the inner text with the CDATA opening (matched
 * case-insensitively) and closing markers removed.
 */
function xmltext()
{
	$withoutOpening = str_ireplace('<![CDATA[', '', $this->innertext());

	return str_replace(']]>', '', $withoutOpening);
}
465
+
466
/**
 * Render the opening tag of this node, including its attributes with
 * their recorded spacing and quoting. Text-like nodes return their
 * noise-restored text instead.
 */
function makeup()
{
	// text, comment, unknown
	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$markup = '<' . $this->tag;
	$position = 0;

	foreach ($this->attr as $name => $value) {
		// Index into the spacing/quote info; it advances for every
		// attribute, removed ones included.
		$i = $position++;

		// skip removed attribute
		if ($value === null || $value === false) { continue; }

		$markup .= $this->_[HDOM_INFO_SPACE][$i][0];

		// no value attr: nowrap, checked selected...
		if ($value === true) {
			$markup .= $name;
			continue;
		}

		$quoteType = $this->_[HDOM_INFO_QUOTE][$i];
		if ($quoteType == HDOM_QUOTE_DOUBLE) {
			$quote = '"';
		} elseif ($quoteType == HDOM_QUOTE_SINGLE) {
			$quote = '\'';
		} else {
			$quote = '';
		}

		$markup .= $name
		. $this->_[HDOM_INFO_SPACE][$i][1]
		. '='
		. $this->_[HDOM_INFO_SPACE][$i][2]
		. $quote
		. $value
		. $quote;
	}

	return $this->dom->restore_noise($markup) . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
508
+
509
/**
 * Find nodes below this node that match a selector string.
 *
 * Returns every match as an array when $idx is null, otherwise the match
 * at $idx (negative values count from the end) or null when absent.
 */
function find($selector, $idx = null, $lowercase = false)
{
	$selectors = $this->parse_selector($selector);
	$selectorCount = count($selectors);
	if ($selectorCount === 0) { return array(); }

	$matchedKeys = array();

	// find each selector
	for ($s = 0; $s < $selectorCount; ++$s) {
		$levels = count($selectors[$s]);
		if ($levels === 0) { return array(); }
		if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

		$heads = array($this->_[HDOM_INFO_BEGIN] => 1);
		$combinator = ' ';

		// handle descendant selectors, no recursive!
		for ($level = 0; $level < $levels; ++$level) {
			$hits = array();

			foreach ($heads as $key => $unused) {
				$start = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
				// Pass the optional $lowercase parameter on to seek().
				$start->seek($selectors[$s][$level], $hits, $combinator, $lowercase);
			}

			$heads = $hits;
			$combinator = $selectors[$s][$level][4]; // Next Combinator
		}

		foreach ($heads as $key => $unused) {
			if (!isset($matchedKeys[$key])) {
				$matchedKeys[$key] = 1;
			}
		}
	}

	// sort keys
	ksort($matchedKeys);

	$found = array();
	foreach ($matchedKeys as $key => $unused) {
		$found[] = $this->dom->nodes[$key];
	}

	// return nth-element or array
	if (is_null($idx)) {
		return $found;
	}

	if ($idx < 0) {
		$idx = count($found) + $idx;
	}

	return isset($found[$idx]) ? $found[$idx] : null;
}
560
+
561
/**
 * Collect the nodes related to this node that satisfy one parsed selector.
 *
 * Candidates are chosen by the combinator that led to this step (' '
 * descendants, '>' children, '+' next sibling, '~' following siblings) and
 * each candidate is tested against the selector's tag, id, classes and
 * attribute conditions. Matches are recorded as keys of $ret.
 *
 * @param array  $selector   One parsed selector: tag, id, classes, attributes, combinator.
 * @param array  $ret        Result set (by reference); matching node indexes are added as keys with value 1.
 * @param string $parent_cmd Combinator linking the previous selector step to this one.
 * @param bool   $lowercase  When true, class names and attribute values are compared in lowercase.
 *
 * @return void
 */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    list($tag, $id, $class, $attributes) = $selector;
    $nodes = array();

    if ($parent_cmd === ' ') { // Descendant Combinator
        // Find parent closing tag if the current element doesn't have a
        // closing tag (i.e. void element)
        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end == 0) {
            $parent = $this->parent;
            while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            // No ancestor with an end marker: leave $end untouched instead of
            // reading a property of null.
            if ($parent !== null) {
                $end += $parent->_[HDOM_INFO_END];
            }
        }

        // Get list of target nodes
        $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
        $nodes_count = $end - $nodes_start;
        $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
    } elseif ($parent_cmd === '>') { // Child Combinator
        $nodes = $this->children;
    } elseif ($parent_cmd === '+' && $this->parent) { // Next-Sibling Combinator
        // Strict search: a loose comparison would compare node objects
        // property by property.
        $index = array_search($this, $this->parent->children, true);
        if ($index !== false && $index + 1 < count($this->parent->children)) {
            $nodes[] = $this->parent->children[$index + 1];
        }
    } elseif ($parent_cmd === '~' && $this->parent) { // Subsequent Sibling Combinator
        $index = array_search($this, $this->parent->children, true);
        if ($index !== false) {
            // Only siblings AFTER this node qualify; the node itself does not.
            $nodes = array_slice($this->parent->children, $index + 1);
        }
    }

    // Go through each candidate node
    foreach ($nodes as $node) {
        $pass = true;

        // Skip root nodes
        if (!$node->parent) {
            $pass = false;
        }

        // Handle 'text' selector
        if ($pass && $tag === 'text' && $node->tag === 'text') {
            $ret[array_search($node, $this->dom->nodes, true)] = 1;
            unset($node);
            continue;
        }

        // Skip if node isn't a child node (i.e. text nodes)
        if ($pass && !in_array($node, $node->parent->children, true)) {
            $pass = false;
        }

        // Skip if tag doesn't match
        if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
            $pass = false;
        }

        // Skip if ID doesn't exist
        if ($pass && $id !== '' && !isset($node->attr['id'])) {
            $pass = false;
        }

        // Check if ID matches
        if ($pass && $id !== '' && isset($node->attr['id'])) {
            // Note: Only consider the first ID (as browsers do)
            $node_id = explode(' ', trim($node->attr['id']))[0];

            if ($id !== $node_id) { $pass = false; }
        }

        // Check if all class(es) exist
        if ($pass && $class !== '' && is_array($class) && !empty($class)) {
            if (isset($node->attr['class'])) {
                $node_classes = explode(' ', $node->attr['class']);

                if ($lowercase) {
                    $node_classes = array_map('strtolower', $node_classes);
                }

                foreach ($class as $wanted_class) {
                    // Strict: numeric-looking class names such as "1e3" and
                    // "1000" must not be treated as equal.
                    if (!in_array($wanted_class, $node_classes, true)) {
                        $pass = false;
                        break;
                    }
                }
            } else {
                $pass = false;
            }
        }

        // Check attributes
        if ($pass
            && $attributes !== ''
            && is_array($attributes)
            && !empty($attributes)) {
            foreach ($attributes as $a) {
                list(
                    $att_name,
                    $att_expr,
                    $att_val,
                    $att_inv,
                    $att_case_sensitivity
                ) = $a;

                // Handle indexing attributes (i.e. "[2]").
                // Not part of the CSS standard, but allows XPath-like
                // selection of the n-th element of its tag within the parent.
                if (is_numeric($att_name)
                    && $att_expr === ''
                    && $att_val === '') {
                    $count = 0;

                    // Find index of current element in parent
                    foreach ($node->parent->children as $sibling) {
                        if ($sibling->tag === $node->tag) { ++$count; }
                        if ($sibling === $node) { break; }
                    }

                    // If this is the correct node, continue with next
                    // attribute
                    if ($count === (int)$att_name) { continue; }
                }

                // Check attribute availability
                if ($att_inv) { // Attribute should NOT be set
                    if (isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                } else { // Attribute should be set
                    // todo: "plaintext" is not a valid CSS selector!
                    if ($att_name !== 'plaintext'
                        && !isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                }

                // Continue with next attribute if expression isn't defined
                if ($att_expr === '') { continue; }

                // A "plaintext" condition is tested against the node's text
                // rather than against an attribute value.
                // todo "plaintext" is not a valid CSS selector!
                if ($att_name === 'plaintext') {
                    $nodeKeyValue = $node->text();
                } else {
                    $nodeKeyValue = $node->attr[$att_name];
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'testing node: '
                        . $node->tag
                        . ' for attribute: '
                        . $att_name
                        . $att_expr
                        . $att_val
                        . ' where nodes value is: '
                        . $nodeKeyValue
                    );
                }

                // If lowercase is set, do a case insensitive test of
                // the value of the selector.
                if ($lowercase) {
                    $check = $this->match(
                        $att_expr,
                        strtolower($att_val),
                        strtolower($nodeKeyValue),
                        $att_case_sensitivity
                    );
                } else {
                    $check = $this->match(
                        $att_expr,
                        $att_val,
                        $nodeKeyValue,
                        $att_case_sensitivity
                    );
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'after match: '
                        . ($check ? 'true' : 'false')
                    );
                }

                if (!$check) {
                    $pass = false;
                    break;
                }
            }
        }

        // Found a match. Add to list and clear node
        if ($pass) { $ret[$node->_[HDOM_INFO_BEGIN]] = 1; }
        unset($node);
    }

    // $ret is passed by reference, so it is the effective return value.
    if (is_object($debug_object)) {
        $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
    }
}
782
+
783
/**
 * Test an attribute value against a selector value with a given operator.
 *
 * @param string $exp              Operator: '=', '!=', '^=', '$=', '*=', '|=' or '~='.
 * @param string $pattern          Value written in the selector.
 * @param string $value            Value found on the node.
 * @param string $case_sensitivity 'i' lowercases both sides before comparing.
 *
 * @return bool|int Truthy when the value satisfies the operator; false for an
 *                  unknown operator. The regex-based operators return the
 *                  preg_match() result.
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    if ($case_sensitivity === 'i') {
        $pattern = strtolower($pattern);
        $value = strtolower($value);
    }

    switch ($exp) {
        case '=':
            return ($value === $pattern);
        case '!=':
            return ($value !== $pattern);
        case '^=':
            return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
        case '$=':
            return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
        case '*=':
            return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
        case '|=':
            /**
             * [att|=val]
             *
             * Represents an element with the att attribute, its value
             * either being exactly "val" or beginning with "val"
             * immediately followed by "-" (U+002D). A bare prefix test
             * would wrongly accept e.g. "english" for "en".
             */
            return $value === $pattern
                || strpos($value, $pattern . '-') === 0;
        case '~=':
            /**
             * [att~=val]
             *
             * Represents an element with the att attribute whose value is a
             * whitespace-separated list of words, one of which is exactly
             * "val". If "val" contains whitespace, it will never represent
             * anything (since the words are separated by spaces). Also if
             * "val" is the empty string, it will never represent anything.
             */
            return in_array($pattern, explode(' ', trim($value)), true);
    }

    return false;
}
827
+
828
/**
 * Split a selector string into a list of selector chains.
 *
 * The string is tokenised with one regular expression (adapted from
 * mootools, https://mootools.net/) whose groups are, in order: tag name,
 * id, class names (dot separated), attribute conditions and the trailing
 * separator. An attribute condition may start with "@" (not captured) and
 * its name may contain a colon or start with "!" to negate it.
 *
 * Each step of a chain is an array:
 *   [0] tag name (lowercased when the DOM is in lowercase mode)
 *   [1] id
 *   [2] '' or list of class names
 *   [3] '' or list of [name, expression, value, inverted, case flag]
 *   [4] combinator leading to the next step ('' at the end of a chain)
 *
 * @param string $selector_string Selector as passed to find().
 *
 * @return array List of chains; a comma in the selector starts a new chain.
 */
protected function parse_selector($selector_string)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // phpcs:ignore Generic.Files.LineLength
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

    // A trailing space acts as a pseudo separator for the last step.
    $subject = trim($selector_string) . ' ';
    preg_match_all($pattern, $subject, $matches, PREG_SET_ORDER);

    if (is_object($debug_object)) {
        $debug_object->debug_log(2, 'Matches Array: ', $matches);
    }

    $chains = [];
    $chain = [];

    foreach ($matches as $token) {
        $whole = trim($token[0]);

        // Skip no-ops
        if ($whole === '' || $whole === '/' || $whole === '//') {
            continue;
        }

        $tag_name = $this->dom->lowercase ? strtolower($token[1]) : $token[1];
        $id_name = $token[2];

        $classes = $token[3];
        if ($classes !== '') {
            $classes = explode('.', $classes);
        }

        $conditions = $token[4];
        if ($conditions !== '') {
            // Groups: [1] name, [2] expression, [3] value, [4] case flag.
            preg_match_all(
                "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
                trim($conditions),
                $attributes,
                PREG_SET_ORDER
            );

            $conditions = [];

            foreach ($attributes as $att) {
                // Skip empty matches
                if (trim($att[0]) === '') {
                    continue;
                }

                // A leading "!" negates the attribute condition.
                $inverted = (isset($att[1][0]) && $att[1][0] === '!');

                $conditions[] = [
                    $inverted ? substr($att[1], 1) : $att[1],      // name
                    isset($att[2]) ? $att[2] : '',                 // expression
                    isset($att[3]) ? $att[3] : '',                 // value
                    $inverted,                                     // inverted flag
                    isset($att[4]) ? strtolower($att[4]) : '',     // case sensitivity
                ];
            }
        }

        // Whitespace-only separators collapse to the descendant combinator.
        $separator = $token[5];
        if ($separator !== '' && trim($separator) === '') {
            $separator = ' ';
        } else {
            $separator = trim($separator);
        }

        // A comma ends the current chain instead of linking two steps.
        $ends_chain = ($separator === ',');
        if ($ends_chain) {
            $separator = '';
        }

        $chain[] = [$tag_name, $id_name, $classes, $conditions, $separator];

        if ($ends_chain) {
            $chains[] = $chain;
            $chain = [];
        }
    }

    if (count($chain) > 0) {
        $chains[] = $chain;
    }

    return $chains;
}
966
+
967
/**
 * Magic read access to attributes and the virtual text properties.
 *
 * @param string $name Attribute name, or one of 'outertext', 'innertext',
 *                     'plaintext', 'xmltext'.
 *
 * @return mixed The charset-converted attribute value when the attribute is
 *               set; the requested text for the virtual properties;
 *               otherwise whether the attribute key exists.
 */
public function __get($name)
{
    if (isset($this->attr[$name])) {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name === 'outertext') {
        return $this->outertext();
    }

    if ($name === 'innertext') {
        return $this->innertext();
    }

    if ($name === 'plaintext') {
        return $this->text();
    }

    if ($name === 'xmltext') {
        return $this->xmltext();
    }

    return array_key_exists($name, $this->attr);
}
980
+
981
/**
 * Magic write access to attributes and the virtual text properties.
 *
 * 'outertext' and 'innertext' overwrite the stored outer/inner (or text)
 * content; any other name sets an attribute. An attribute that is not
 * currently set gets default spacing and double quotes recorded for it.
 *
 * @param string $name  Attribute or virtual property name.
 * @param mixed  $value New value.
 *
 * @return mixed The assigned value for 'outertext'/'innertext', nothing otherwise.
 */
public function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    if ($name === 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }

    if ($name === 'innertext') {
        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;

        return $this->_[$slot] = $value;
    }

    $is_new = !isset($this->attr[$name]);
    if ($is_new) {
        $this->_[HDOM_INFO_SPACE][] = [' ', '', ''];
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }

    $this->attr[$name] = $value;
}
1002
+
1003
/**
 * Magic isset() support.
 *
 * The virtual properties 'outertext', 'innertext' and 'plaintext' always
 * count as set; any other name counts as set when its key exists in the
 * attribute list (this covers value-less attributes such as nowrap,
 * checked, selected).
 *
 * @param string $name Property or attribute name.
 *
 * @return bool
 */
public function __isset($name)
{
    if (in_array($name, ['outertext', 'innertext', 'plaintext'])) {
        return true;
    }

    return array_key_exists($name, $this->attr);
}
1013
+
1014
/**
 * Magic unset() support: drop an attribute that is currently set.
 *
 * @param string $name Attribute name.
 *
 * @return void
 */
public function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
1018
+
1019
/**
 * Convert text from the document charset to the target charset.
 *
 * Conversion only happens when both charsets are known and differ. Text that
 * already is valid UTF-8 is left alone when the target is UTF-8. If iconv()
 * fails, the unconverted text is kept instead of returning false. A UTF-8
 * byte order mark at either end of the result is removed.
 *
 * @param string $text Text to convert.
 *
 * @return string Converted text.
 */
public function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $converted_text = $text;

    $sourceCharset = '';
    $targetCharset = '';

    if ($this->dom) {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }

    if (is_object($debug_object)) {
        $debug_object->debug_log(3,
            'source charset: '
            . $sourceCharset
            . ' target charaset: '
            . $targetCharset
        );
    }

    if (!empty($sourceCharset)
        && !empty($targetCharset)
        && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
        // The reported encoding may be wrong: text that already is UTF-8
        // must not be converted again.
        $already_utf8 = (strcasecmp($targetCharset, 'UTF-8') == 0)
            && self::is_utf8($text);

        if (!$already_utf8) {
            $recoded = iconv($sourceCharset, $targetCharset, $text);

            // iconv() returns false on failure; keep the original text then.
            if ($recoded !== false) {
                $converted_text = $recoded;
            }
        }
    }

    // Make sure no BOM is left on any UTF-8 text we output.
    if ($targetCharset === 'UTF-8') {
        if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 3);
        }

        if (substr($converted_text, -3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
1068
+
1069
/**
 * Check whether a byte string is structurally valid UTF-8.
 *
 * Every byte above 127 must be a lead byte followed by the right number of
 * continuation bytes (0x80-0xBF). Lead bytes announcing 5 and 6 byte
 * sequences are accepted; 0xFE and 0xFF are rejected.
 *
 * @param string $str Bytes to check.
 *
 * @return bool
 */
public static function is_utf8($str)
{
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);

        // Plain ASCII. Byte 128 (0x80) is NOT ASCII: it is a continuation
        // byte and must be rejected when it appears on its own.
        if ($c < 128) {
            continue;
        }

        if ($c >= 254) { return false; }
        elseif ($c >= 252) { $bits = 6; }
        elseif ($c >= 248) { $bits = 5; }
        elseif ($c >= 240) { $bits = 4; }
        elseif ($c >= 224) { $bits = 3; }
        elseif ($c >= 192) { $bits = 2; }
        else { return false; } // continuation byte without a lead byte

        // The announced sequence must fit into the remaining bytes.
        if (($i + $bits) > $len) { return false; }

        while ($bits > 1) {
            $i++;
            $b = ord($str[$i]);
            if ($b < 128 || $b > 191) { return false; }
            $bits--;
        }
    }

    return true;
}
1095
+
1096
/**
 * Determine the display size of an <img> node.
 *
 * The width/height attributes win. A dimension that has no attribute is
 * taken from the inline style when it is declared there as an integer
 * number of pixels (e.g. "width: 120px"). Style property names are matched
 * case-insensitively and surrounding whitespace of the value is ignored.
 *
 * Classes, ids and external style sheets are not consulted.
 *
 * @return array|false False when this node is not an img; otherwise
 *                     array('height' => ..., 'width' => ...), with -1 for a
 *                     dimension that could not be determined.
 */
public function get_display_size()
{
    if ($this->tag !== 'img') {
        return false;
    }

    $dimensions = array('width', 'height');
    $size = array('height' => -1, 'width' => -1);

    // See if there is a height or width attribute in the tag itself.
    foreach ($dimensions as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style'])) {
        preg_match_all(
            '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
            $this->attr['style'],
            $matches,
            PREG_SET_ORDER
        );

        $declarations = array();
        foreach ($matches as $match) {
            // The value group is greedy and may carry trailing whitespace.
            $declarations[strtolower($match[1])] = trim($match[2]);
        }

        foreach ($dimensions as $dimension) {
            // Attributes take precedence over the style declaration.
            if ($size[$dimension] != -1 || !isset($declarations[$dimension])) {
                continue;
            }

            $declared = $declarations[$dimension];

            // Only pixel lengths are understood.
            if (strtolower(substr($declared, -2)) !== 'px') {
                continue;
            }

            $pixels = substr($declared, 0, -2);

            // Compare against false so that "0" is accepted as an integer.
            if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
                $size[$dimension] = $pixels;
            }
        }
    }

    return $size;
}
1180
+
1181
/**
 * Return the outer text of this node, optionally writing it to a file.
 *
 * @param string $filepath Target file; nothing is written when empty. The
 *                         file is written with an exclusive lock.
 *
 * @return string Outer text of this node.
 */
public function save($filepath = '')
{
    $markup = $this->outertext();

    if ($filepath === '') {
        return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
}
1191
+
1192
/**
 * Add one or more class names to this node.
 *
 * Names already present are skipped, as are empty names produced by
 * consecutive spaces in the input.
 *
 * @param string|array $class Space separated class names or a list of names.
 *
 * @return void
 */
public function addClass($class)
{
    global $debug_object;

    if (is_string($class)) {
        $class = explode(' ', $class);
    }

    if (!is_array($class)) {
        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
        }

        return;
    }

    foreach ($class as $c) {
        if ($c === '') {
            continue;
        }

        // Look at the attribute list directly: a removed class attribute is
        // stored as null and must be treated as absent, not appended to.
        if (!isset($this->attr['class'])) {
            $this->class = $c;
            continue;
        }

        if (!$this->hasClass($c)) {
            $this->class .= ' ' . $c;
        }
    }
}
1216
+
1217
/**
 * Tell whether this node carries the given class name.
 *
 * @param string $class A single class name.
 *
 * @return bool True when the name is one of the space separated entries of
 *              the class attribute; false otherwise, including when $class
 *              is not a string.
 */
public function hasClass($class)
{
    global $debug_object;

    if (!is_string($class)) {
        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
        }

        return false;
    }

    // A removed class attribute is stored as null; reading it through the
    // magic getter would yield true and match the class name "1".
    if (!isset($this->attr['class'])) {
        return false;
    }

    return in_array($class, explode(' ', $this->class), true);
}
1231
+
1232
/**
 * Remove class names from this node.
 *
 * @param string|array|null $class Null removes the whole class attribute;
 *                                 otherwise space separated names or a list
 *                                 of names to remove.
 *
 * @return void
 */
public function removeClass($class = null)
{
    // Check the attribute list directly: an already removed class attribute
    // is stored as null and leaves nothing to do.
    if (!isset($this->attr['class'])) {
        return;
    }

    if (is_null($class)) {
        $this->removeAttribute('class');
        return;
    }

    if (is_string($class)) {
        $class = explode(' ', $class);
    }

    if (is_array($class)) {
        $remaining = array_diff(explode(' ', $this->class), $class);

        if (empty($remaining)) {
            $this->removeAttribute('class');
        } else {
            $this->class = implode(' ', $remaining);
        }
    }
}
1256
+
1257
/**
 * Return the raw attribute list of this node.
 *
 * @return array The attr property, unchanged.
 */
public function getAllAttributes()
{
    return $this->attr;
}
1261
+
1262
/**
 * Read an attribute; method form of the magic getter.
 *
 * @param string $name Attribute name.
 *
 * @return mixed Whatever __get() returns for $name.
 */
public function getAttribute($name)
{
    return $this->__get($name);
}
1266
+
1267
/**
 * Write an attribute; method form of the magic setter.
 *
 * @param string $name  Attribute name.
 * @param mixed  $value New value.
 *
 * @return void
 */
public function setAttribute($name, $value)
{
    $this->__set($name, $value);
}
1271
+
1272
/**
 * Tell whether an attribute is present; method form of the magic isset.
 *
 * @param string $name Attribute name.
 *
 * @return bool Whatever __isset() returns for $name.
 */
public function hasAttribute($name)
{
    return $this->__isset($name);
}
1276
+
1277
/**
 * Remove an attribute by setting it to null through the magic setter.
 *
 * @param string $name Attribute name.
 *
 * @return void
 */
public function removeAttribute($name)
{
    $this->__set($name, null);
}
1281
+
1282
/**
 * Detach this node from its parent, if it has one.
 *
 * @return void
 */
public function remove()
{
    if (!$this->parent) {
        return;
    }

    $this->parent->removeChild($this);
}
1288
+
1289
/**
 * Remove a child node, together with everything below it.
 *
 * Nothing happens unless $node is found in this node's node list, in its
 * children list and in the DOM's node list. The child's own children are
 * removed first, then its remaining entities, and finally the child itself
 * is dropped from all three lists and cleared.
 *
 * @param object $node Child node to remove.
 *
 * @return void
 */
public function removeChild($node)
{
    $own_key = array_search($node, $this->nodes, true);
    $child_key = array_search($node, $this->children, true);
    $dom_key = array_search($node, $this->dom->nodes, true);

    if ($own_key === false || $child_key === false || $dom_key === false) {
        return;
    }

    foreach ($node->children as $grandchild) {
        $node->removeChild($grandchild);
    }

    foreach ($node->nodes as $entity) {
        $entity_key = array_search($entity, $node->nodes, true);
        $entity_dom_key = array_search($entity, $node->dom->nodes, true);

        if ($entity_key === false || $entity_dom_key === false) {
            continue;
        }

        unset($node->nodes[$entity_key]);
        unset($node->dom->nodes[$entity_dom_key]);
    }

    unset($this->nodes[$own_key]);
    unset($this->children[$child_key]);
    unset($this->dom->nodes[$dom_key]);

    $node->clear();
}
1319
+
1320
/**
 * Return the first node matching the selector "#<id>".
 *
 * @param string $id Element id.
 *
 * @return object|null Result of find() at position 0.
 */
public function getElementById($id)
{
    return $this->find("#$id", 0);
}
1324
+
1325
/**
 * Return the nodes matching the selector "#<id>".
 *
 * @param string   $id  Element id.
 * @param int|null $idx Null for all matches, otherwise the wanted position.
 *
 * @return array|object|null Result of find().
 */
public function getElementsById($id, $idx = null)
{
    return $this->find("#$id", $idx);
}
1329
+
1330
/**
 * Return the first node matching the given tag name selector.
 *
 * @param string $name Tag name, used as the selector.
 *
 * @return object|null Result of find() at position 0.
 */
public function getElementByTagName($name)
{
    return $this->find($name, 0);
}
1334
+
1335
/**
 * Return the nodes matching the given tag name selector.
 *
 * @param string   $name Tag name, used as the selector.
 * @param int|null $idx  Null for all matches, otherwise the wanted position.
 *
 * @return array|object|null Result of find().
 */
public function getElementsByTagName($name, $idx = null)
{
    return $this->find($name, $idx);
}
1339
+
1340
/**
 * camelCase alias of parent().
 *
 * @return mixed Whatever parent() returns.
 */
public function parentNode()
{
    return $this->parent();
}
1344
+
1345
/**
 * camelCase alias of children().
 *
 * @param int $idx Passed through to children(); defaults to -1.
 *
 * @return mixed Whatever children() returns.
 */
public function childNodes($idx = -1)
{
    return $this->children($idx);
}
1349
+
1350
/**
 * camelCase alias of first_child().
 *
 * @return mixed Whatever first_child() returns.
 */
public function firstChild()
{
    return $this->first_child();
}
1354
+
1355
/**
 * camelCase alias of last_child().
 *
 * @return mixed Whatever last_child() returns.
 */
public function lastChild()
{
    return $this->last_child();
}
1359
+
1360
/**
 * camelCase alias of next_sibling().
 *
 * @return mixed Whatever next_sibling() returns.
 */
public function nextSibling()
{
    return $this->next_sibling();
}
1364
+
1365
/**
 * camelCase alias of prev_sibling().
 *
 * @return mixed Whatever prev_sibling() returns.
 */
public function previousSibling()
{
    return $this->prev_sibling();
}
1369
+
1370
/**
 * camelCase alias of has_child().
 *
 * @return mixed Whatever has_child() returns.
 */
public function hasChildNodes()
{
    return $this->has_child();
}
1374
+
1375
/**
 * Return the tag name of this node.
 *
 * @return string The tag property.
 */
public function nodeName()
{
    return $this->tag;
}
1379
+
1380
/**
 * Make this node the parent of $node.
 *
 * @param object $node Node to attach; its parent() method is called with
 *                     this node as argument.
 *
 * @return object The node that was passed in.
 */
public function appendChild($node)
{
    $node->parent($this);

    return $node;
}
1385
+
1386
+ }
1387
+
1388
+ class simple_html_dom
1389
+ {
1390
// Document tree: root node and flat list of all nodes.
public $root = null;
public $nodes = [];

// User callback, set through set_callback() / remove_callback().
public $callback = null;

// Lowercase mode as requested by the caller of load().
public $lowercase = false;

// Length of the document in bytes, before and after preprocessing.
public $original_size;
public $size;

// Parser state.
protected $pos;
protected $doc;
protected $char;

protected $cursor;
protected $parent;
protected $noise = [];

// Character sets used by the tokenizer.
protected $token_blank = " \t\r\n";
protected $token_equal = ' =/>';
protected $token_slash = " />\r\n\t";
protected $token_attr = ' >';

// Charset of the document and charset wanted for output.
public $_charset = '';
public $_target_charset = '';

protected $default_br_text = '';

public $default_span_text = '';

protected $self_closing_tags = [
    'area' => 1,
    'base' => 1,
    'br' => 1,
    'col' => 1,
    'embed' => 1,
    'hr' => 1,
    'img' => 1,
    'input' => 1,
    'link' => 1,
    'meta' => 1,
    'param' => 1,
    'source' => 1,
    'track' => 1,
    'wbr' => 1,
];

protected $block_tags = [
    'body' => 1,
    'div' => 1,
    'form' => 1,
    'root' => 1,
    'span' => 1,
    'table' => 1,
];

protected $optional_closing_tags = [
    // Not optional, see
    // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
    'b' => ['b' => 1],
    'dd' => ['dd' => 1, 'dt' => 1],
    // Not optional, see
    // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
    'dl' => ['dd' => 1, 'dt' => 1],
    'dt' => ['dd' => 1, 'dt' => 1],
    'li' => ['li' => 1],
    'optgroup' => ['optgroup' => 1, 'option' => 1],
    'option' => ['optgroup' => 1, 'option' => 1],
    'p' => ['p' => 1],
    'rp' => ['rp' => 1, 'rt' => 1],
    'rt' => ['rp' => 1, 'rt' => 1],
    'td' => ['td' => 1, 'th' => 1],
    'th' => ['td' => 1, 'th' => 1],
    'tr' => ['td' => 1, 'th' => 1, 'tr' => 1],
];
1459
+
1460
/**
 * Create a DOM object and optionally load a document into it.
 *
 * @param string|null $str             HTML markup, a file path, or an
 *                                     http(s) URL. Nothing is loaded when empty.
 * @param bool        $lowercase       Passed to load().
 * @param bool        $forceTagsClosed When false, the optional-closing-tag
 *                                     table is emptied before parsing.
 * @param string      $target_charset  Charset wanted for output.
 * @param bool        $stripRN         Passed to load().
 * @param string      $defaultBRText   Passed to load().
 * @param string      $defaultSpanText Passed to load().
 * @param int         $options         Passed to load().
 */
public function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    // This must target the declared $optional_closing_tags property and
    // run before the document is parsed, otherwise it has no effect.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str) {
        // Both http:// and https:// addresses are treated as URLs.
        if (preg_match('/^https?:\/\//i', $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load(
                $str,
                $lowercase,
                $stripRN,
                $defaultBRText,
                $defaultSpanText,
                $options
            );
        }
    }

    $this->_target_charset = $target_charset;
}
1492
+
1493
/**
 * Release the node tree when the DOM object is destroyed.
 */
public function __destruct()
{
    $this->clear();
}
1497
+
1498
/**
 * Load and parse an HTML string.
 *
 * Before parsing, several kinds of content are handed to remove_noise():
 * script blocks, CDATA sections, comments, style blocks, code blocks,
 * server side scripts and, when HDOM_SMARTY_AS_TEXT is set in $options,
 * Smarty scripts. Script removal precedes style removal (sourceforge
 * tracker item 2949097).
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase       Passed to prepare().
 * @param bool   $stripRN         When true, "\r" and "\n" are replaced by spaces.
 * @param string $defaultBRText   Passed to prepare().
 * @param string $defaultSpanText Passed to prepare().
 * @param int    $options         Bit flags; HDOM_SMARTY_AS_TEXT is honoured here.
 *
 * @return $this To allow chaining.
 */
public function load(
    $str,
    $lowercase = true,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

    // <script> tags
    $script_patterns = [
        "'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is",
        "'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is",
    ];
    foreach ($script_patterns as $script_pattern) {
        $this->remove_noise($script_pattern);
    }

    if ($stripRN) {
        $this->doc = str_replace(["\r", "\n"], ' ', $this->doc);

        // The content changed, so its length has to be refreshed.
        $this->size = strlen($this->doc);
    }

    // CDATA
    $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);

    // comments, <style> tags, preformatted tags
    $markup_patterns = [
        "'<!--(.*?)-->'is",
        "'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is",
        "'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is",
        "'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is",
    ];
    foreach ($markup_patterns as $markup_pattern) {
        $this->remove_noise($markup_pattern);
    }

    // server side scripts
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

    // Smarty scripts
    if ($options & HDOM_SMARTY_AS_TEXT) {
        $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
    }

    $this->parse();
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    return $this;
}
1551
+
1552
/**
 * Read a document with file_get_contents() and load it.
 *
 * All arguments are forwarded unchanged to file_get_contents(). The content
 * is loaded in lowercase mode.
 *
 * @return false|null False when the content could not be read; nothing otherwise.
 */
public function load_file()
{
    $contents = call_user_func_array('file_get_contents', func_get_args());

    if ($contents === false) {
        return false;
    }

    $this->load($contents, true);
}
1562
+
1563
/**
 * Register a callback on this DOM object.
 *
 * @param mixed $function_name Stored as-is in the callback property.
 *
 * @return void
 */
public function set_callback($function_name)
{
    $this->callback = $function_name;
}
1567
+
1568
/**
 * Unregister the callback by resetting the callback property to null.
 *
 * @return void
 */
public function remove_callback()
{
    $this->callback = null;
}
1572
+
1573
/**
 * Return the document markup, optionally writing it to a file.
 *
 * @param string $filepath Target file; nothing is written when empty. The
 *                         file is written with an exclusive lock.
 *
 * @return string Inner text of the root node.
 */
public function save($filepath = '')
{
    $markup = $this->root->innertext();

    if ($filepath === '') {
        return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
}
1579
+
1580
/**
 * Find nodes in the document; delegates to the root node's find().
 *
 * @param string   $selector  Selector string.
 * @param int|null $idx       Null for all matches, otherwise the wanted position.
 * @param bool     $lowercase Passed through to the root node.
 *
 * @return mixed Whatever the root node's find() returns.
 */
public function find($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;

    return $root->find($selector, $idx, $lowercase);
}
1584
+
1585
/**
 * Clear every node and drop the references held by this DOM object.
 *
 * Clearing "children" as well as "nodes" is the fix documented in the
 * sourceforge repository (item 2977248) for memory leaks that occurred
 * even when clear() was used.
 *
 * @return void
 */
public function clear()
{
    foreach (['nodes', 'children'] as $list_name) {
        if (!isset($this->$list_name)) {
            continue;
        }

        foreach ($this->$list_name as $member) {
            $member->clear();
            $member = null;
        }
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc, $this->noise);
}
1617
+
1618
/**
 * Dump the document; delegates to the root node's dump().
 *
 * @param bool $show_attr Passed through to the root node.
 *
 * @return void
 */
public function dump($show_attr = true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
1622
+
1623
/**
 * Reset the parser state for a new document.
 *
 * Clears any previous tree, stores the trimmed document, resets position,
 * cursor, noise and node list, and creates a fresh root node that also
 * becomes the current parent.
 *
 * @param string $str             Markup about to be parsed.
 * @param bool   $lowercase       Stored in the lowercase property.
 * @param string $defaultBRText   Stored in default_br_text.
 * @param string $defaultSpanText Stored in default_span_text.
 *
 * @return void
 */
protected function prepare(
    $str, $lowercase = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $this->clear();

    $document = trim($str);
    $length = strlen($document);

    $this->doc = $document;
    $this->size = $length;
    $this->original_size = $length; // original size of the html
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = [];
    $this->nodes = [];
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    $this->root = new simple_html_dom_node($this);
    $root = $this->root;
    $root->tag = 'root';
    $root->_[HDOM_INFO_BEGIN] = -1;
    $root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $root;

    if ($length > 0) {
        $this->char = $document[0];
    }
}
1647
+
1648
/**
 * Main parse loop: alternate between text runs and tags.
 *
 * Text found before the next '<' becomes a text node linked to the
 * current parent; when there is no such text, read_tag() is asked to
 * consume a tag. The loop ends once read_tag() reports false.
 *
 * @return true
 */
protected function parse()
{
    for (;;) {
        $text = $this->copy_until_char('<');

        if ($text === '') {
            // Nothing between the current position and the next tag.
            if (!$this->read_tag()) {
                return true;
            }

            continue;
        }

        // Text between tags becomes its own node.
        $textNode = new simple_html_dom_node($this);
        ++$this->cursor;
        $textNode->_[HDOM_INFO_TEXT] = $text;
        $this->link_nodes($textNode, false);
    }
}
1668
+
1669
/**
 * Determine the document charset and cache it in $this->_charset.
 *
 * Sources are tried in this order, stopping at the first non-empty one:
 *  1. the content-type header reported by
 *     get_last_retrieve_url_contents_content_type(), if that function exists;
 *  2. a <meta http-equiv="Content-Type"> element;
 *  3. a <meta charset> element;
 *  4. mb_detect_encoding() on the document, if mbstring is available;
 *  5. a hard-coded fallback of UTF-8.
 *
 * ISO-8859-1 / latin1 / latin-1 are finally mapped to CP1252.
 *
 * @return string The detected charset.
 */
protected function parse_charset()
{
    global $debug_object;

    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        // Media type parameter names are case-insensitive, so match
        // "Charset=" / "CHARSET=" as well (same as the meta tag below).
        $success = preg_match('/charset=(.+)/i', $contentTypeHeader, $matches);
        if ($success) {
            $charset = $matches[1];
            if (is_object($debug_object)) {
                $debug_object->debug_log(2,
                    'header content-type found charset of: '
                    . $charset
                );
            }
        }
    }

    if (empty($charset)) {
        // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

        if (!empty($el)) {
            $fullvalue = $el->content;
            if (is_object($debug_object)) {
                $debug_object->debug_log(2,
                    'meta content-type tag found'
                    . $fullvalue
                );
            }

            if (!empty($fullvalue)) {
                $success = preg_match(
                    '/charset=(.+)/i',
                    $fullvalue,
                    $matches
                );

                if ($success) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the
                    // character set, research says that it's typically
                    // ISO-8859-1
                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2,
                            'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
                        );
                    }

                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    if (empty($charset)) {
        // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
        if ($meta = $this->root->find('meta[charset]', 0)) {
            $charset = $meta->charset;
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'meta charset: ' . $charset);
            }
        }
    }

    if (empty($charset)) {
        // Try to guess the charset based on the content
        // Requires Multibyte String (mbstring) support (optional)
        if (function_exists('mb_detect_encoding')) {
            /**
             * mb_detect_encoding() is not intended to distinguish between
             * charsets, especially single-byte charsets. Its primary
             * purpose is to detect which multibyte encoding is in use,
             * i.e. UTF-8, UTF-16, shift-JIS, etc.
             *
             * -- https://bugs.php.net/bug.php?id=38138
             *
             * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
             * always result in CP1251/ISO-8859-5 and vice versa.
             *
             * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
             * to stay compatible.
             */
            $encoding = mb_detect_encoding(
                $this->doc,
                array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
            );

            if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
                // Due to a limitation of mb_detect_encoding
                // 'CP1251'/'ISO-8859-5' will be detected as
                // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
                // which case we can simply assume it is the other charset.
                if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
                    $encoding = 'CP1251';
                }
            }

            if ($encoding !== false) {
                $charset = $encoding;
                if (is_object($debug_object)) {
                    $debug_object->debug_log(2, 'mb_detect: ' . $charset);
                }
            }
        }
    }

    if (empty($charset)) {
        // Assume it's UTF-8 as it is the most likely charset to be used
        $charset = 'UTF-8';
        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'No match found, assume ' . $charset);
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want
    // it instead.
    if ((strtolower($charset) == 'iso-8859-1')
        || (strtolower($charset) == 'latin1')
        || (strtolower($charset) == 'latin-1')) {
        // Log BEFORE overwriting, so the message names the charset that
        // is actually being replaced.
        if (is_object($debug_object)) {
            $debug_object->debug_log(2,
                'replacing ' . $charset . ' with CP1252 as its a superset'
            );
        }
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {
        $debug_object->debug_log(1, 'EXIT - ' . $charset);
    }

    return $this->_charset = $charset;
}
1806
+
1807
/**
 * Consume one tag starting at the current position and update the tree.
 *
 * Handles, in order: end tags (including mismatched ones, which may close
 * an ancestor or be kept as text), "<!...>" constructs (comments and
 * other declarations), malformed tag names (kept as text) and regular
 * start tags with their attributes.
 *
 * @return bool False when the current character is not '<' (the root's
 * end position is recorded in that case); true after a tag, or a piece of
 * text standing in for a broken tag, has been consumed.
 */
protected function read_tag()
{
    // Set end position if no further tags found
    if ($this->char !== '<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }

    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char === '/') {
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        // Skip whitespace in end tags (i.e. in "</ html>")
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // Skip attributes in end tags
        if (($pos = strpos($tag, ' ')) !== false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        // The end tag is supposed to close the parent tag. Handle situations
        // when it doesn't
        if ($parent_lower !== $tag_lower) {
            // Parent tag does not have to be closed necessarily (optional closing tag)
            // Current tag is a block tag, so it may close an ancestor
            if (isset($this->optional_closing_tags[$parent_lower])
                && isset($this->block_tags[$tag_lower])) {

                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Traverse ancestors to find a matching opening tag
                // Stop at root node
                while (($this->parent->parent)
                    && strtolower($this->parent->tag) !== $tag_lower
                ){
                    $this->parent = $this->parent->parent;
                }

                // If we don't have a match add current tag as text node
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent

                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }

                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent)
                && isset($this->block_tags[$tag_lower])
            ) {
                // Grandparent exists and current tag is a block tag, so our
                // parent doesn't have an end tag
                $this->parent->_[HDOM_INFO_END] = 0; // No end tag
                $org_parent = $this->parent;

                // Traverse ancestors to find a matching opening tag
                // Stop at root node
                while (($this->parent->parent)
                    && strtolower($this->parent->tag) !== $tag_lower
                ) {
                    $this->parent = $this->parent->parent;
                }

                // If we don't have a match add current tag as text node
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent)
                && strtolower($this->parent->parent->tag) === $tag_lower
            ) { // Grandparent exists and current tag closes it
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else { // Random tag, add as text node
                return $this->as_text_node($tag);
            }
        }

        // Set end position of parent tag to current cursor position
        $this->parent->_[HDOM_INFO_END] = $this->cursor;

        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // start tag
    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash); // Get tag name
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    // <!DOCTYPE html>
    // <![CDATA[ ... ]]>
    // <!-- Comment -->
    if (isset($tag[0]) && $tag[0] === '!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else { // Could be doctype or CDATA but we don't care
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }

        if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }

        $this->link_nodes($node, true);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // The start tag cannot contain another start tag, if so add as text
    // i.e. "<<html>"
    // (The former "$pos = strpos(...) !== false" only ever stored a
    // boolean in an unused variable because of operator precedence.)
    if (strpos($tag, '<') !== false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Handle invalid tag names (i.e. "<html#doc>")
    if (!preg_match('/^\w[\w:-]*$/', $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

        // Next char is the beginning of a new tag, don't touch it.
        if ($this->char === '<') {
            $this->link_nodes($node, false);
            return true;
        }

        // Next char closes current tag, add and be done with it.
        if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
        $this->link_nodes($node, false);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag, add new node
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower])) {
        // Traverse ancestors to close all optional closing tags
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop

    // [0] Space between tag and first attribute
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
        // Everything until the first equal sign should be the attribute name
        $name = $this->copy_until($this->token_equal);

        if ($name === '' && $this->char !== null && $space[0] === '') {
            break;
        }

        if ($guard === $this->pos) { // Escape infinite loop
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }

        $guard = $this->pos;

        // handle endless '<'
        // Out of bounds before the tag ended
        if ($this->pos >= $this->size - 1 && $this->char !== '>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        // Attributes cannot start after opening tag
        if ($this->doc[$this->pos - 1] == '<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr(
                $this->doc,
                $begin_tag_pos,
                $this->pos - $begin_tag_pos - 1
            );
            $this->pos -= 2;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name !== '/' && $name !== '') { // this is a attribute name
            // [1] Whitespace after attribute name
            $space[1] = $this->copy_skip($this->token_blank);

            $name = $this->restore_noise($name); // might be a noisy name

            if ($this->lowercase) { $name = strtolower($name); }

            if ($this->char === '=') { // attribute with value
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space); // get attribute value
            } else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
            }

            $node->_[HDOM_INFO_SPACE][] = $space;

            // prepare for next attribute
            $space = array(
                $this->copy_skip($this->token_blank),
                '',
                ''
            );
        } else { // no more attributes
            break;
        }
    } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // handle empty tags (i.e. "<div/>")
    if ($this->copy_until_char('>') === '/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    } else {
        // reset parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
            $this->parent = $node;
        }
    }

    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set its text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag === 'br') {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
2084
+
2085
/**
 * Read one attribute value at the current position and store it.
 *
 * The value may be double-quoted, single-quoted or unquoted. Carriage
 * returns and line feeds are stripped from it, and a "class" value is
 * additionally trimmed. If the node already has an attribute of this
 * name, the value is still consumed but nothing is stored.
 *
 * @param object $node Node receiving the attribute.
 * @param string $name Attribute name.
 * @param array $space Whitespace triple; index 2 receives the blanks
 * between "=" and the value (only for non-duplicates).
 *
 * @return void
 */
protected function parse_attr($node, $name, &$space)
{
    $is_duplicate = isset($node->attr[$name]);

    if (!$is_duplicate) { // Copy whitespace between "=" and value
        $space[2] = $this->copy_skip($this->token_blank);
    }

    $opening = $this->char;

    if ($opening === '"' || $opening === '\'') {
        $quote_type = ($opening === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        // Step over the opening quote, read up to the matching quote,
        // then step over the closing quote.
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        $value = $this->copy_until_char($opening);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    } else {
        $quote_type = HDOM_QUOTE_NO;
        $value = $this->copy_until($this->token_attr);
    }

    $value = $this->restore_noise($value);

    // PaperG: Attributes should not have \r or \n in them, that counts as
    // html whitespace.
    $value = str_replace(array("\r", "\n"), '', $value);

    // PaperG: For "class", drop leading and trailing space since some
    // people leave it in the multi class case.
    if ($name === 'class') {
        $value = trim($value);
    }

    if ($is_duplicate) {
        return;
    }

    $node->_[HDOM_INFO_QUOTE][] = $quote_type;
    $node->attr[$name] = $value;
}
2128
+
2129
/**
 * Attach a node to the current parent.
 *
 * @param object $node Node to attach; its parent property is set.
 * @param bool $is_child When true the node is also appended to the
 * parent's children list, not only to its nodes list.
 *
 * @return void
 */
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child) {
        return;
    }

    $owner->children[] = $node;
}
2137
+
2138
/**
 * Keep an end tag as literal text.
 *
 * Creates a node whose text is "</tag>", links it to the current parent
 * (not as a child) and advances to the next character.
 *
 * @param string $tag Tag name to render inside "</...>".
 *
 * @return true
 */
protected function as_text_node($tag)
{
    $textNode = new simple_html_dom_node($this);
    ++$this->cursor;
    $textNode->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($textNode, false);

    // next
    ++$this->pos;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;

    return true;
}
2147
+
2148
/**
 * Advance past a run of the given characters.
 *
 * @param string $chars Set of characters to skip over.
 *
 * @return void
 */
protected function skip($chars)
{
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;

    // Current character, or null once the end of the document is reached.
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;
}
2153
+
2154
/**
 * Advance past a run of the given characters and return that run.
 *
 * @param string $chars Set of characters to skip over.
 *
 * @return string The skipped characters, or '' when none matched.
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $run = strspn($this->doc, $chars, $start);

    $this->pos += $run;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return ($run === 0) ? '' : substr($this->doc, $start, $run);
}
2163
+
2164
/**
 * Advance up to the first occurrence of any of the given characters
 * and return what was passed over.
 *
 * @param string $chars Set of stop characters.
 *
 * @return string Result of substr() over the skipped range.
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $run = strcspn($this->doc, $chars, $start);

    $this->pos += $run;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return substr($this->doc, $start, $run);
}
2172
+
2173
/**
 * Advance up to the next occurrence of a single character and return
 * what was passed over.
 *
 * When the character does not occur again, the rest of the document is
 * returned and the parser is moved to the end (current char = null).
 *
 * @param string $char Stop character.
 *
 * @return string Skipped text; '' when already at the end of the
 * document or already positioned on $char.
 */
protected function copy_until_char($char)
{
    if ($this->char === null) {
        return '';
    }

    $start = $this->pos;
    $found = strpos($this->doc, $char, $start);

    if ($found === false) {
        // Not found: consume everything that is left.
        $remainder = substr($this->doc, $start, $this->size - $start);
        $this->char = null;
        $this->pos = $this->size;
        return $remainder;
    }

    if ($found === $start) {
        return '';
    }

    $this->char = $this->doc[$found];
    $this->pos = $found;

    return substr($this->doc, $start, $found - $start);
}
2191
+
2192
/**
 * Replace every match of a pattern in the document with a placeholder.
 *
 * Each match is stored in the noise table under a generated
 * "___noise___" key and the key is substituted into the document.
 * Matches are processed from last to first so earlier offsets stay valid.
 *
 * @param string $pattern PCRE pattern to search for.
 * @param bool $remove_tag True replaces the whole match, false only
 * the first capture group.
 *
 * @return void
 */
protected function remove_noise($pattern, $remove_tag = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $total = preg_match_all(
        $pattern,
        $this->doc,
        $matches,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );

    // 0 = entire match, 1 = submatch
    $group = ($remove_tag) ? 0 : 1;

    for ($index = $total - 1; $index >= 0; --$index) {
        $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'key is: ' . $key);
        }

        $fragment = $matches[$index][$group][0];
        $offset = $matches[$index][$group][1];

        $this->noise[$key] = $fragment;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($fragment));
    }

    // reset the length of content
    $this->size = strlen($this->doc);

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
2223
+
2224
/**
 * Replace "___noise___" placeholders in a text with their stored content.
 *
 * A placeholder is the 11-character marker followed by a 5-character
 * number field (16 characters in total), as generated by remove_noise().
 *
 * Unknown keys are replaced by a diagnostic string that still contains
 * the key, and truncated markers by "NO NUMERIC NOISE KEY". The scan
 * continues AFTER such a diagnostic, so an unknown key can no longer make
 * the loop spin forever on its own output.
 *
 * @param string $text Text possibly containing placeholders.
 *
 * @return string Text with placeholders resolved.
 */
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $marker = '___noise___';
    $offset = 0;

    while (($pos = strpos($text, $marker, $offset)) !== false) {
        // Sometimes there is a broken piece of markup, and we don't GET the
        // pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos + 15) {
            // Marker (11 chars) plus the 5-character number field.
            $key = substr($text, $pos, 16);

            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'located key of: ' . $key);
            }

            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos)
                    . $this->noise[$key]
                    . substr($text, $pos + 16);

                // Rescan from the same spot: restored content may itself
                // contain placeholders created by an earlier pass.
                // NOTE(review): a noise entry that contains its own key
                // would still loop here; not addressed by this change.
                $offset = $pos;
            } else {
                $replacement = 'UNDEFINED NOISE FOR KEY: ' . $key;
                $text = substr($text, 0, $pos)
                    . $replacement
                    . substr($text, $pos + 16);

                // The replacement embeds the key (and thus the marker);
                // skip past it, otherwise it would be matched again on
                // every iteration and the text would grow without bound.
                $offset = $pos + strlen($replacement);
            }
        } else {
            // There is no valid key being given back to us... We must get
            // rid of the ___noise___ or we will have a problem.
            $text = substr($text, 0, $pos)
                . 'NO NUMERIC NOISE KEY'
                . substr($text, $pos + 11);
            $offset = $pos;
        }
    }

    return $text;
}
2269
+
2270
/**
 * Find the first stored noise entry that contains the given text.
 *
 * @param string $text Substring to look for.
 *
 * @return string|null The matching noise entry; nothing is returned
 * when no entry contains $text.
 */
function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    foreach ($this->noise as $fragment) {
        if (strpos($fragment, $text) === false) {
            continue;
        }

        return $fragment;
    }
}
2281
+
2282
/**
 * String conversion: the inner text of the root node.
 *
 * @return string
 */
function __toString()
{
    $root = $this->root;

    return $root->innertext();
}
2286
+
2287
/**
 * Magic read access for a fixed set of virtual properties.
 *
 * - outertext / innertext: inner text of the root node
 * - plaintext: text() of the root node
 * - charset: the _charset property
 * - target_charset: the _target_charset property
 *
 * @param string $name Property being read.
 *
 * @return mixed The mapped value; nothing is returned for other names.
 */
function __get($name)
{
    if ($name === 'outertext' || $name === 'innertext') {
        return $this->root->innertext();
    }

    if ($name === 'plaintext') {
        return $this->root->text();
    }

    if ($name === 'charset') {
        return $this->_charset;
    }

    if ($name === 'target_charset') {
        return $this->_target_charset;
    }
}
2302
+
2303
/**
 * Delegate to the root node's childNodes().
 *
 * @param int $idx Forwarded unchanged to the root node.
 *
 * @return mixed Whatever the root node's childNodes() returns.
 */
function childNodes($idx = -1)
{
    $root = $this->root;

    return $root->childNodes($idx);
}
2307
+
2308
/**
 * Delegate to the root node's first_child().
 *
 * @return mixed Whatever the root node's first_child() returns.
 */
function firstChild()
{
    $root = $this->root;

    return $root->first_child();
}
2312
+
2313
/**
 * Delegate to the root node's last_child().
 *
 * @return mixed Whatever the root node's last_child() returns.
 */
function lastChild()
{
    $root = $this->root;

    return $root->last_child();
}
2317
+
2318
/**
 * Build an element by parsing "<name>value</name>" with str_get_html()
 * and returning the first child of the resulting document.
 *
 * Errors raised by the expression are silenced with "@".
 *
 * @param string $name Tag name.
 * @param string|null $value Content placed between the tags.
 *
 * @return mixed Result of firstChild() on the parsed fragment.
 */
function createElement($name, $value = null)
{
    $markup = "<$name>$value</$name>";

    return @str_get_html($markup)->firstChild();
}
2322
+
2323
/**
 * Parse the given value with str_get_html() and return the last entry
 * of the resulting document's nodes list.
 *
 * Errors raised by the expression are silenced with "@".
 *
 * @param string $value Text to parse.
 *
 * @return mixed Result of end() on the parsed document's nodes.
 */
function createTextNode($value)
{
    // Kept as a single expression: end() takes its argument by reference
    // and the whole call is covered by the "@" operator.
    return @end(str_get_html($value)->nodes);
}
2327
+
2328
/**
 * Look up the first match of the selector "#<id>".
 *
 * @param string $id Element id.
 *
 * @return mixed Result of find() with index 0.
 */
function getElementById($id)
{
    $selector = "#$id";

    return $this->find($selector, 0);
}
2332
+
2333
/**
 * Query the selector "#<id>".
 *
 * @param string $id Element id.
 * @param int|null $idx Forwarded unchanged to find().
 *
 * @return mixed Result of find().
 */
function getElementsById($id, $idx = null)
{
    $selector = "#$id";

    return $this->find($selector, $idx);
}
2337
+
2338
/**
 * Look up the first match for a tag name.
 *
 * @param string $name Used as the selector.
 *
 * @return mixed Result of find() with index 0.
 */
function getElementByTagName($name)
{
    $first = $this->find($name, 0);

    return $first;
}
2342
+
2343
/**
 * Query by tag name.
 *
 * @param string $name Used as the selector.
 * @param int $idx Forwarded unchanged to find(); defaults to -1.
 *
 * @return mixed Result of find().
 */
function getElementsByTagName($name, $idx = -1)
{
    $result = $this->find($name, $idx);

    return $result;
}
2347
+
2348
/**
 * camelCase alias of load_file().
 *
 * The arguments are spread onto load_file(), which forwards them to
 * file_get_contents(). Previously the whole argument array was passed as
 * ONE argument, so file_get_contents() received an array instead of a
 * filename.
 *
 * @return false|null The result of load_file(): false when the file
 * could not be read.
 */
function loadFile()
{
    $args = func_get_args();

    return call_user_func_array(array($this, 'load_file'), $args);
}
2353
+ }