Gallery Custom Links - Version 1.0.9

Version Description

  • Info: This plugin is way more work than expected, please motivate me by giving it a good review and also by trying my other plugins (https://meowapps.com). Thanks a lot for that :) I will do my best to make this plugin perfect.
  • Update: Avoid interfering at all with all Ajax/Rest requests.
  • Update: Use HtmlDomParser instead of DiDom (this parser is less sensitive to badly formatted HTML). I am also thinking of doing it all with regexp, but let's see that later.
  • Info: Sorry for the last bunch of updates; some ways of modifying HTML work for some and not for others, and I am still trying to find a solution which works for everyone.
Download this release

Release Info

Developer TigrouMeow
Plugin Icon 128x128 Gallery Custom Links
Version 1.0.9
Comparing to
See all releases

Code changes from version 1.0.8 to 1.0.9

composer.json CHANGED
@@ -1,5 +1,6 @@
1
  {
2
  "require": {
3
- "imangazaliev/didom": "^1.14"
 
4
  }
5
  }
1
  {
2
  "require": {
3
+ "imangazaliev/didom": "^1.14",
4
+ "kub-at/php-simple-html-dom-parser": "^1.7"
5
  }
6
  }
composer.lock CHANGED
@@ -4,7 +4,7 @@
4
  "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
5
  "This file is @generated automatically"
6
  ],
7
- "content-hash": "dd2d693feb08c59f7273fa9e59333cd3",
8
  "packages": [
9
  {
10
  "name": "imangazaliev/didom",
@@ -54,6 +54,52 @@
54
  "xml"
55
  ],
56
  "time": "2019-01-17T11:01:36+00:00"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  }
58
  ],
59
  "packages-dev": [],
4
  "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
5
  "This file is @generated automatically"
6
  ],
7
+ "content-hash": "580c75a9e32770a65f6b4d397a87b25d",
8
  "packages": [
9
  {
10
  "name": "imangazaliev/didom",
54
  "xml"
55
  ],
56
  "time": "2019-01-17T11:01:36+00:00"
57
+ },
58
+ {
59
+ "name": "kub-at/php-simple-html-dom-parser",
60
+ "version": "1.7.1",
61
+ "source": {
62
+ "type": "git",
63
+ "url": "https://github.com/Kub-AT/php-simple-html-dom-parser.git",
64
+ "reference": "7a745b20157efb0f1be3021394769bd6b8e9ed4e"
65
+ },
66
+ "dist": {
67
+ "type": "zip",
68
+ "url": "https://api.github.com/repos/Kub-AT/php-simple-html-dom-parser/zipball/7a745b20157efb0f1be3021394769bd6b8e9ed4e",
69
+ "reference": "7a745b20157efb0f1be3021394769bd6b8e9ed4e",
70
+ "shasum": ""
71
+ },
72
+ "require": {
73
+ "php": ">=5.3.2"
74
+ },
75
+ "type": "library",
76
+ "autoload": {
77
+ "psr-0": {
78
+ "KubAT\\PhpSimple\\HtmlDomParser": "src/"
79
+ }
80
+ },
81
+ "notification-url": "https://packagist.org/downloads/",
82
+ "license": [
83
+ "MIT"
84
+ ],
85
+ "authors": [
86
+ {
87
+ "name": "S.C. Chen",
88
+ "email": "me578022@gmail.com"
89
+ },
90
+ {
91
+ "name": "Jakub Stawowy",
92
+ "email": "Kub-AT@users.noreply.github.com"
93
+ }
94
+ ],
95
+ "description": "PHP Simple HTML DOM Parser with namespace and PHP 7.3 compatible",
96
+ "homepage": "http://simplehtmldom.sourceforge.net/",
97
+ "keywords": [
98
+ "Simple",
99
+ "dom",
100
+ "html"
101
+ ],
102
+ "time": "2019-01-02T14:33:28+00:00"
103
  }
104
  ],
105
  "packages-dev": [],
gallery_custom_links.php CHANGED
@@ -3,7 +3,7 @@
3
  Plugin Name: Gallery Custom Links
4
  Plugin URI: https://meowapps.com
5
  Description: Gallery Custom Links allows you to link images from galleries to a specified URL. Tested with WordPress Gallery, Gutenberg, the Meow Gallery and others.
6
- Version: 1.0.8
7
  Author: Jordy Meow
8
  Author URI: https://meowapps.com
9
  Text Domain: gallery-custom-links
@@ -23,7 +23,7 @@ if ( class_exists( 'Meow_Gallery_Custom_Links' ) ) {
23
  }
24
 
25
  global $mgcl_version;
26
- $mgcl_version = '1.0.8';
27
 
28
  include "mgcl_admin.php";
29
  $mgcl_admin = new Meow_Gallery_Custom_Links_Admin( 'mgcl', __FILE__, 'gallery-custom-links' );
3
  Plugin Name: Gallery Custom Links
4
  Plugin URI: https://meowapps.com
5
  Description: Gallery Custom Links allows you to link images from galleries to a specified URL. Tested with WordPress Gallery, Gutenberg, the Meow Gallery and others.
6
+ Version: 1.0.9
7
  Author: Jordy Meow
8
  Author URI: https://meowapps.com
9
  Text Domain: gallery-custom-links
23
  }
24
 
25
  global $mgcl_version;
26
+ $mgcl_version = '1.0.9';
27
 
28
  include "mgcl_admin.php";
29
  $mgcl_admin = new Meow_Gallery_Custom_Links_Admin( 'mgcl', __FILE__, 'gallery-custom-links' );
mgcl_core.php CHANGED
@@ -4,18 +4,23 @@ require_once 'vendor/autoload.php';
4
 
5
  use DiDom\Document;
6
  use DiDom\Element;
 
7
 
8
  class Meow_Gallery_Custom_Links
9
  {
10
  public $isEnabled = true;
11
- public $isOb = true;
 
 
 
12
 
13
  public function __construct() {
14
 
15
- if ( is_admin() )
16
  return;
 
17
 
18
- if ( $this->isOb ) {
19
  add_action( 'init', array( $this, 'start' ) );
20
  add_action( 'shutdown', array( $this, 'shutdown' ) );
21
  add_action( 'wp_footer', array( $this, 'unlink_lightboxes_script' ) ) ;
@@ -26,6 +31,24 @@ class Meow_Gallery_Custom_Links
26
  }
27
  }
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  function init() {
30
  //add_action( 'init', array( $this, 'init' ) );
31
  // We don't need this now, we go through all the images.
@@ -35,7 +58,7 @@ class Meow_Gallery_Custom_Links
35
 
36
  function start() {
37
  $this->isEnabled = apply_filters( 'gallery_custom_links_enabled', true );
38
- if ( $this->isEnabled && $this->isOb )
39
  ob_start( array( $this, "linkify" ) );
40
  }
41
 
@@ -82,40 +105,36 @@ class Meow_Gallery_Custom_Links
82
  if ( empty( $target ) )
83
  $target = '_self';
84
  $parent = $element->parent();
85
-
86
- // Let's look for the closest link tag enclosing the image
87
- $potentialLinkNode = $parent;
88
- $maxDepth = 10;
89
- do {
90
- if ( property_exists( $potentialLinkNode, 'tag' ) && $potentialLinkNode->tag === 'a' ) {
91
- $potentialLinkNode->attr( 'href', $url );
92
- $class = $potentialLinkNode->attr( 'class' );
93
- $class = empty( $class ) ? 'custom-link no-lightbox' : ( $class . ' custom-link no-lightbox' );
94
- $potentialLinkNode->attr( 'class', $class );
95
- $potentialLinkNode->attr( 'onclick', 'event.stopPropagation()' );
96
- $potentialLinkNode->attr( 'target', $target );
97
- return true;
98
  }
99
- if ( method_exists( $potentialLinkNode, 'parent' ) )
100
- $potentialLinkNode = $potentialLinkNode->parent();
101
- else
102
- break;
103
  }
104
- while ( $potentialLinkNode && $maxDepth-- >= 0 );
105
-
106
- // There is no link tag, so we add one and move the image under it
107
- if ( $parent->tag === 'figure' )
108
- $parent = $parent->parent();
109
- $a = new Element('a');
110
- $a->attr( 'href', $url );
111
- $a->attr( 'class', 'custom-link no-lightbox' );
112
- $a->attr( 'onclick', 'event.stopPropagation()' );
113
- $a->attr( 'target', $target );
114
- $a->appendChild( $parent->children() );
115
- foreach( $parent->children() as $img ) {
116
- $img->remove();
117
  }
118
- $parent->appendChild( $a );
119
  return true;
120
  }
121
  }
4
 
5
  use DiDom\Document;
6
  use DiDom\Element;
7
+ use KubAT\PhpSimple\HtmlDomParser;
8
 
9
  class Meow_Gallery_Custom_Links
10
  {
11
  public $isEnabled = true;
12
+ // use OB on the whole page, or only go through the the_content ($renderingMode will be ignored)
13
+ public $isObMode = true;
14
+ // 'HtmlDomParser' (less prone to break badly formatted HTML) or 'DiDom' (faster)
15
+ public $parsingEngine = 'HtmlDomParser';
16
 
17
  public function __construct() {
18
 
19
+ if ( is_admin() || $this->is_rest() ) {
20
  return;
21
+ }
22
 
23
+ if ( $this->isObMode ) {
24
  add_action( 'init', array( $this, 'start' ) );
25
  add_action( 'shutdown', array( $this, 'shutdown' ) );
26
  add_action( 'wp_footer', array( $this, 'unlink_lightboxes_script' ) ) ;
31
  }
32
  }
33
 
34
+ function is_rest() {
35
+ $prefix = rest_get_url_prefix( );
36
+ if ( defined( 'REST_REQUEST' ) && REST_REQUEST || isset( $_GET['rest_route'] ) &&
37
+ strpos( trim( $_GET['rest_route'], '\\/' ), $prefix , 0 ) === 0) {
38
+ return true;
39
+ }
40
+
41
+ // (#3)
42
+ $rest_url = wp_parse_url( site_url( $prefix ) );
43
+ $current_url = wp_parse_url( add_query_arg( array( ) ) );
44
+ return strpos( $current_url['path'], $rest_url['path'], 0 ) === 0;
45
+ }
46
+
47
+ function setOptions() {
48
+ $this->isObMode = get_option( 'mwl_obmode', $this->isObMode );
49
+ $this->parsingEngine = get_option( 'mwl_parsing_engine', $this->parsingEngine );
50
+ }
51
+
52
  function init() {
53
  //add_action( 'init', array( $this, 'init' ) );
54
  // We don't need this now, we go through all the images.
58
 
59
  function start() {
60
  $this->isEnabled = apply_filters( 'gallery_custom_links_enabled', true );
61
+ if ( $this->isEnabled && $this->isObMode )
62
  ob_start( array( $this, "linkify" ) );
63
  }
64
 
105
  if ( empty( $target ) )
106
  $target = '_self';
107
  $parent = $element->parent();
108
+ if ( $this->parsingEngine === 'HtmlDomParser' ) {
109
+ $src = $element->src;
110
+ $mglSrc = $element->{'mgl-src'};
111
+ if ( $parent->{'tag'} === 'figure' )
112
+ $parent = $parent->parent();
113
+ $a = new Element('a');
114
+ $a->{'href'} = $url;
115
+ $a->{'class'} = 'custom-link no-lightbox';
116
+ $a->{'onclick'} = 'event.stopPropagation()';
117
+ $a->{'target'} = $target;
118
+ $a->appendChild( $parent->children() );
119
+ foreach( $parent->children() as $img ) {
120
+ $img->remove();
121
  }
122
+ $parent->appendChild( $a );
 
 
 
123
  }
124
+ else {
125
+ if ( $parent->tag === 'figure' )
126
+ $parent = $parent->parent();
127
+ $a = new Element('a');
128
+ $a->attr( 'href', $url );
129
+ $a->attr( 'class', 'custom-link no-lightbox' );
130
+ $a->attr( 'onclick', 'event.stopPropagation()' );
131
+ $a->attr( 'target', $target );
132
+ $a->appendChild( $parent->children() );
133
+ foreach( $parent->children() as $img ) {
134
+ $img->remove();
135
+ }
136
+ $parent->appendChild( $a );
137
  }
 
138
  return true;
139
  }
140
  }
readme.txt CHANGED
@@ -1,10 +1,10 @@
1
  === Gallery Custom Links ===
2
  Contributors: TigrouMeow
3
  Tags: custom, links, gallery, gutenberg
4
- Requires at least: 4.9
5
- Tested up to: 5.0
6
  Requires PHP: 7.0
7
- Stable tag: 1.0.8
8
 
9
  Gallery Custom Links allows you to link images from galleries to a specified URL. Tested with WordPress Gallery, Gutenberg, the Meow Gallery and others.
10
 
@@ -41,6 +41,12 @@ Replace all the files. Nothing else to do.
41
 
42
  == Changelog ==
43
 
 
 
 
 
 
 
44
  = 1.0.8 =
45
  * Update: Back to OB, maybe there should be an option for that.
46
  * Update: Get all the images of the page/post content instead of within specific containers previously.
1
  === Gallery Custom Links ===
2
  Contributors: TigrouMeow
3
  Tags: custom, links, gallery, gutenberg
4
+ Requires at least: 5.0
5
+ Tested up to: 5.2
6
  Requires PHP: 7.0
7
+ Stable tag: 1.0.9
8
 
9
  Gallery Custom Links allows you to link images from galleries to a specified URL. Tested with WordPress Gallery, Gutenberg, the Meow Gallery and others.
10
 
41
 
42
  == Changelog ==
43
 
44
+ = 1.0.9 =
45
+ * Info: This plugin is way more work than expected, please motivate me by giving it a good review and also by trying my other plugins (https://meowapps.com). Thanks a lot for that :) I will do my best to make this plugin perfect.
46
+ * Update: Avoid interfering at all with all Ajax/Rest requests.
47
+ * Update: Use HtmlDomParser instead of DiDom (this parser is less sensitive to badly formatted HTML). I am also thinking of doing it all with regexp, but let's see that later.
48
+ * Info: Sorry for the last bunch of updates; some ways of modifying HTML work for some and not for others, and I am still trying to find a solution which works for everyone.
49
+
50
  = 1.0.8 =
51
  * Update: Back to OB, maybe there should be an option for that.
52
  * Update: Get all the images of the page/post content instead of within specific containers previously.
vendor/composer/autoload_namespaces.php CHANGED
@@ -6,4 +6,5 @@ $vendorDir = dirname(dirname(__FILE__));
6
  $baseDir = dirname($vendorDir);
7
 
8
  return array(
 
9
  );
6
  $baseDir = dirname($vendorDir);
7
 
8
  return array(
9
+ 'KubAT\\PhpSimple\\HtmlDomParser' => array($vendorDir . '/kub-at/php-simple-html-dom-parser/src'),
10
  );
vendor/composer/autoload_static.php CHANGED
@@ -20,11 +20,22 @@ class ComposerStaticInit2605e89cef9fc84e4fbf6431ef455676
20
  ),
21
  );
22
 
 
 
 
 
 
 
 
 
 
 
23
  public static function getInitializer(ClassLoader $loader)
24
  {
25
  return \Closure::bind(function () use ($loader) {
26
  $loader->prefixLengthsPsr4 = ComposerStaticInit2605e89cef9fc84e4fbf6431ef455676::$prefixLengthsPsr4;
27
  $loader->prefixDirsPsr4 = ComposerStaticInit2605e89cef9fc84e4fbf6431ef455676::$prefixDirsPsr4;
 
28
 
29
  }, null, ClassLoader::class);
30
  }
20
  ),
21
  );
22
 
23
+ public static $prefixesPsr0 = array (
24
+ 'K' =>
25
+ array (
26
+ 'KubAT\\PhpSimple\\HtmlDomParser' =>
27
+ array (
28
+ 0 => __DIR__ . '/..' . '/kub-at/php-simple-html-dom-parser/src',
29
+ ),
30
+ ),
31
+ );
32
+
33
  public static function getInitializer(ClassLoader $loader)
34
  {
35
  return \Closure::bind(function () use ($loader) {
36
  $loader->prefixLengthsPsr4 = ComposerStaticInit2605e89cef9fc84e4fbf6431ef455676::$prefixLengthsPsr4;
37
  $loader->prefixDirsPsr4 = ComposerStaticInit2605e89cef9fc84e4fbf6431ef455676::$prefixDirsPsr4;
38
+ $loader->prefixesPsr0 = ComposerStaticInit2605e89cef9fc84e4fbf6431ef455676::$prefixesPsr0;
39
 
40
  }, null, ClassLoader::class);
41
  }
vendor/composer/installed.json CHANGED
@@ -49,5 +49,53 @@
49
  "parser",
50
  "xml"
51
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  }
53
  ]
49
  "parser",
50
  "xml"
51
  ]
52
+ },
53
+ {
54
+ "name": "kub-at/php-simple-html-dom-parser",
55
+ "version": "1.7.1",
56
+ "version_normalized": "1.7.1.0",
57
+ "source": {
58
+ "type": "git",
59
+ "url": "https://github.com/Kub-AT/php-simple-html-dom-parser.git",
60
+ "reference": "7a745b20157efb0f1be3021394769bd6b8e9ed4e"
61
+ },
62
+ "dist": {
63
+ "type": "zip",
64
+ "url": "https://api.github.com/repos/Kub-AT/php-simple-html-dom-parser/zipball/7a745b20157efb0f1be3021394769bd6b8e9ed4e",
65
+ "reference": "7a745b20157efb0f1be3021394769bd6b8e9ed4e",
66
+ "shasum": ""
67
+ },
68
+ "require": {
69
+ "php": ">=5.3.2"
70
+ },
71
+ "time": "2019-01-02T14:33:28+00:00",
72
+ "type": "library",
73
+ "installation-source": "dist",
74
+ "autoload": {
75
+ "psr-0": {
76
+ "KubAT\\PhpSimple\\HtmlDomParser": "src/"
77
+ }
78
+ },
79
+ "notification-url": "https://packagist.org/downloads/",
80
+ "license": [
81
+ "MIT"
82
+ ],
83
+ "authors": [
84
+ {
85
+ "name": "S.C. Chen",
86
+ "email": "me578022@gmail.com"
87
+ },
88
+ {
89
+ "name": "Jakub Stawowy",
90
+ "email": "Kub-AT@users.noreply.github.com"
91
+ }
92
+ ],
93
+ "description": "PHP Simple HTML DOM Parser with namespace and PHP 7.3 compatible",
94
+ "homepage": "http://simplehtmldom.sourceforge.net/",
95
+ "keywords": [
96
+ "Simple",
97
+ "dom",
98
+ "html"
99
+ ]
100
  }
101
  ]
vendor/kub-at/php-simple-html-dom-parser/CONTRIBUTING.md ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ # Contributing
2
+
3
+ I'm not the maintainer of the PHP Simple HTML DOM Parser project (https://sourceforge.net/projects/simplehtmldom/)
vendor/kub-at/php-simple-html-dom-parser/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2019 Jakub Stawowy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
vendor/kub-at/php-simple-html-dom-parser/README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ php-simple-html-dom-parser
2
+ ==========================
3
+
4
+ Version 1.7.1 - PHP 7.3 compatible
5
+ Changelog: https://sourceforge.net/projects/simplehtmldom/files/simplehtmldom/1.7/
6
+
7
+
8
+ Install
9
+ -------
10
+
11
+ ```
12
+ composer require kub-at/php-simple-html-dom-parser
13
+ ```
14
+
15
+ Usage
16
+ -----
17
+
18
+ ```php
19
+ use KubAT\PhpSimple\HtmlDomParser;
20
+
21
+ ...
22
+ $dom = HtmlDomParser::str_get_html( $str );
23
+ or
24
+ $dom = HtmlDomParser::file_get_html( $file_name );
25
+
26
+ $elems = $dom->find($elem_name);
27
+ ...
28
+
29
+ ```
vendor/kub-at/php-simple-html-dom-parser/composer.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "kub-at/php-simple-html-dom-parser",
3
+ "description": "PHP Simple HTML DOM Parser with namespace and PHP 7.3 compatible",
4
+ "keywords": ["html", "dom", "simple"],
5
+ "homepage": "http://simplehtmldom.sourceforge.net/",
6
+ "type": "library",
7
+ "license": "MIT",
8
+ "authors": [
9
+ {
10
+ "name": "S.C. Chen",
11
+ "email": "me578022@gmail.com"
12
+ },
13
+ {
14
+ "name": "Jakub Stawowy",
15
+ "email": "Kub-AT@users.noreply.github.com"
16
+ }
17
+ ],
18
+ "require": {
19
+ "php": ">=5.3.2"
20
+ },
21
+ "autoload": {
22
+ "psr-0": { "KubAT\\PhpSimple\\HtmlDomParser": "src/" }
23
+ }
24
+ }
vendor/kub-at/php-simple-html-dom-parser/src/KubAT/PhpSimple/HtmlDomParser.php ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?php
2
+ namespace KubAT\PhpSimple;
3
+
4
+ require 'lib'.DIRECTORY_SEPARATOR.'simple_html_dom.php';
5
+
6
+
7
+ class HtmlDomParser {
8
+
9
+ static public function file_get_html() {
10
+ return call_user_func_array('\simple_html_dom\file_get_html' , func_get_args());
11
+ }
12
+
13
+ static public function str_get_html() {
14
+ return call_user_func_array('\simple_html_dom\str_get_html' , func_get_args());
15
+ }
16
+ }
vendor/kub-at/php-simple-html-dom-parser/src/KubAT/PhpSimple/lib/simple_html_dom.php ADDED
@@ -0,0 +1,2174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?php
2
+ namespace simple_html_dom;
3
+
4
+ /**
5
+ * Website: http://sourceforge.net/projects/simplehtmldom/
6
+ * Additional projects that may be used: http://sourceforge.net/projects/debugobject/
7
+ * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
8
+ * Contributions by:
9
+ * Yousuke Kumakura (Attribute filters)
10
+ * Vadim Voituk (Negative indexes supports of "find" method)
11
+ * Antcs (Constructor with automatically load contents either text or file/url)
12
+ *
13
+ * all affected sections have comments starting with "PaperG"
14
+ *
15
+ * Paperg - Added case insensitive testing of the value of the selector.
16
+ * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
17
+ * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source,
18
+ * it will almost always be smaller by some amount.
19
+ * We use this to determine how far into the file the tag in question is. This "percentage will never be accurate as the $dom->size is the "real" number of bytes the dom was created from.
20
+ * but for most purposes, it's a really good estimation.
21
+ * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
22
+ * Allow the user to tell us how much they trust the html.
23
+ * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
24
+ * This allows for us to find tags based on the text they contain.
25
+ * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
26
+ * Paperg: added parse_charset so that we know about the character set of the source document.
27
+ * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
28
+ * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
29
+ *
30
+ * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
31
+ * PaperG (John Schlick) Added get_display_size for "IMG" tags.
32
+ *
33
+ * Licensed under The MIT License
34
+ * Redistributions of files must retain the above copyright notice.
35
+ *
36
+ * @author S.C. Chen <me578022@gmail.com>
37
+ * @author John Schlick
38
+ * @author Rus Carroll
39
+ * @version Rev. 1.7 (214)
40
+ * @package PlaceLocalInclude
41
+ * @subpackage simple_html_dom
42
+ */
43
+
44
+ /**
45
+ * All of the Defines for the classes below.
46
+ * @author S.C. Chen <me578022@gmail.com>
47
+ */
48
+ define('HDOM_TYPE_ELEMENT', 1);
49
+ define('HDOM_TYPE_COMMENT', 2);
50
+ define('HDOM_TYPE_TEXT', 3);
51
+ define('HDOM_TYPE_ENDTAG', 4);
52
+ define('HDOM_TYPE_ROOT', 5);
53
+ define('HDOM_TYPE_UNKNOWN', 6);
54
+ define('HDOM_QUOTE_DOUBLE', 0);
55
+ define('HDOM_QUOTE_SINGLE', 1);
56
+ define('HDOM_QUOTE_NO', 3);
57
+ define('HDOM_INFO_BEGIN', 0);
58
+ define('HDOM_INFO_END', 1);
59
+ define('HDOM_INFO_QUOTE', 2);
60
+ define('HDOM_INFO_SPACE', 3);
61
+ define('HDOM_INFO_TEXT', 4);
62
+ define('HDOM_INFO_INNER', 5);
63
+ define('HDOM_INFO_OUTER', 6);
64
+ define('HDOM_INFO_ENDSPACE',7);
65
+ define('DEFAULT_TARGET_CHARSET', 'UTF-8');
66
+ define('DEFAULT_BR_TEXT', "\r\n");
67
+ define('DEFAULT_SPAN_TEXT', " ");
68
+ define('MAX_FILE_SIZE', 600000);
69
+
70
+ /** Contents between curly braces "{" and "}" are interpreted as text */
71
+ define('HDOM_SMARTY_AS_TEXT', 1);
72
+
73
+ // helper functions
74
+ // -----------------------------------------------------------------------------
75
+ // get html dom from file
76
+ // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
77
+ function file_get_html($url, $use_include_path = false, $context=null, $offset = 0, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
78
+ {
79
+ // Ensure maximum length is greater than zero
80
+ if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }
81
+
82
+ // We DO force the tags to be terminated.
83
+ $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
84
+ // For sourceforge users: uncomment the next line and comment the retrieve_url_contents line 2 lines down if it is not already done.
85
+ $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
86
+ // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
87
+ //$contents = retrieve_url_contents($url);
88
+ if (empty($contents) || strlen($contents) > $maxLen)
89
+ {
90
+ return false;
91
+ }
92
+ // The second parameter can force the selectors to all be lowercase.
93
+ $dom->load($contents, $lowercase, $stripRN);
94
+ return $dom;
95
+ }
96
+
97
+ // get html dom from string
98
+ function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
99
+ {
100
+ $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
101
+ if (empty($str) || strlen($str) > MAX_FILE_SIZE)
102
+ {
103
+ $dom->clear();
104
+ return false;
105
+ }
106
+ $dom->load($str, $lowercase, $stripRN);
107
+ return $dom;
108
+ }
109
+
110
+ // dump html dom tree
111
+ function dump_html_tree($node, $show_attr=true, $deep=0)
112
+ {
113
+ $node->dump($node);
114
+ }
115
+
116
+
117
+ /**
118
+ * simple html dom node
119
+ * PaperG - added ability for "find" routine to lowercase the value of the selector.
120
+ * PaperG - added $tag_start to track the start position of the tag in the total byte index
121
+ *
122
+ * @package PlaceLocalInclude
123
+ */
124
+ class simple_html_dom_node
125
+ {
126
+ /**
127
+ * Node type
128
+ *
129
+ * Default is {@see HDOM_TYPE_TEXT}
130
+ *
131
+ * @var int
132
+ */
133
+ public $nodetype = HDOM_TYPE_TEXT;
134
+
135
+ /**
136
+ * Tag name
137
+ *
138
+ * Default is 'text'
139
+ *
140
+ * @var string
141
+ */
142
+ public $tag = 'text';
143
+
144
+ /**
145
+ * List of attributes
146
+ *
147
+ * @var array
148
+ */
149
+ public $attr = array();
150
+
151
+ /**
152
+ * List of child node objects
153
+ *
154
+ * @var array
155
+ */
156
+ public $children = array();
157
+ public $nodes = array();
158
+
159
+ /**
160
+ * The parent node object
161
+ *
162
+ * @var object|null
163
+ */
164
+ public $parent = null;
165
+
166
+ // The "info" array - see HDOM_INFO_... for what each element contains.
167
+ public $_ = array();
168
+
169
+ /**
170
+ * Start position of the tag in the document
171
+ *
172
+ * @var int
173
+ */
174
+ public $tag_start = 0;
175
+
176
+ /**
177
+ * The DOM object
178
+ *
179
+ * @var object|null
180
+ */
181
+ private $dom = null;
182
+
183
+ /**
184
+ * Construct new node object
185
+ *
186
+ * Adds itself to the list of DOM Nodes {@see simple_html_dom::$nodes}
187
+ */
188
+ function __construct($dom)
189
+ {
190
+ $this->dom = $dom;
191
+ $dom->nodes[] = $this;
192
+ }
193
+
194
+ function __destruct()
195
+ {
196
+ $this->clear();
197
+ }
198
+
199
+ function __toString()
200
+ {
201
+ return $this->outertext();
202
+ }
203
+
204
+ // clean up memory due to php5 circular references memory leak...
205
+ function clear()
206
+ {
207
+ $this->dom = null;
208
+ $this->nodes = null;
209
+ $this->parent = null;
210
+ $this->children = null;
211
+ }
212
+
213
+ // dump node's tree
214
+ function dump($show_attr=true, $deep=0)
215
+ {
216
+ $lead = str_repeat(' ', $deep);
217
+
218
+ echo $lead.$this->tag;
219
+ if ($show_attr && count($this->attr)>0)
220
+ {
221
+ echo '(';
222
+ foreach ($this->attr as $k=>$v)
223
+ echo "[$k]=>\"".$this->$k.'", ';
224
+ echo ')';
225
+ }
226
+ echo "\n";
227
+
228
+ if ($this->nodes)
229
+ {
230
+ foreach ($this->nodes as $c)
231
+ {
232
+ $c->dump($show_attr, $deep+1);
233
+ }
234
+ }
235
+ }
236
+
237
+
238
/**
 * Debugging helper: describe this single node on one line.
 *
 * The summary contains the tag, its attributes, the internal info array
 * ($_), the text (when set), the cached inner text, the number of children
 * and nodes, and the tag start offset.
 *
 * Fixes over the previous version:
 * - the cached inner text is read from $this; it used to be read from an
 *   undefined $node, so the summary always reported NULL;
 * - the quotes around the inner text are balanced in both branches;
 * - child/node counts no longer call count() on null, which is what those
 *   properties hold after clear().
 *
 * @param bool $echo When true the summary is printed and nothing is
 *                   returned; when false the summary is returned.
 * @return string|null
 */
function dump_node($echo=true)
{
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"" . $this->$k . '", ';
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    $string .= "[$k2]=>\"" . $v2 . '", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"" . $v . '", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text)) {
        $string .= " text: (" . $this->text . ")";
    }

    $string .= " HDOM_INNER_INFO: ";
    if (isset($this->_[HDOM_INFO_INNER])) {
        $string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
    } else {
        $string .= 'NULL';
    }

    // clear() resets both lists to null; report 0 instead of counting null.
    $childCount = is_array($this->children) ? count($this->children) : 0;
    $nodeCount = is_array($this->nodes) ? count($this->nodes) : 0;

    $string .= " children: " . $childCount;
    $string .= " nodes: " . $nodeCount;
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo) {
        echo $string;
        return;
    }

    return $string;
}
302
+
303
/**
 * Read or assign the parent node.
 *
 * Called without an argument (or with null) this is a plain getter.
 * Called with a node, that node becomes the parent and this node is
 * appended to the new parent's `nodes` and `children` lists.
 *
 * NOTE: the node is not removed from the lists of any previous parent
 * before being attached to the new one.
 *
 * @param object|null $parent (optional) The new parent node, or null to
 * only read the current parent.
 * @return object|null The parent node
 */
function parent($parent=null)
{
    if ($parent === null) {
        return $this->parent;
    }

    $this->parent = $parent;
    $parent->nodes[] = $this;
    $parent->children[] = $this;

    return $parent;
}
323
+
324
/**
 * Tell whether the child list of this node is non-empty.
 *
 * @return bool True if the node has at least one child node
 */
function has_child()
{
    return empty($this->children) ? false : true;
}
331
+
332
/**
 * Fetch one child node by index, or the whole child list.
 *
 * @param int $idx Index of the wanted child; the default `-1` returns the
 * complete child list.
 * @return object|array|null The child at $idx, the full child list, or
 * null when no child exists at that index.
 */
function children($idx=-1)
{
    if ($idx !== -1) {
        return isset($this->children[$idx]) ? $this->children[$idx] : null;
    }

    return $this->children;
}
352
+
353
/**
 * Fetch the first entry of the child list.
 *
 * @return object|null The first child node, or null if the node has no
 * children.
 */
function first_child()
{
    return (count($this->children) > 0) ? $this->children[0] : null;
}
370
+
371
/**
 * Fetch the last entry of the child list.
 *
 * @return object|null The last child node, or null if the node has no
 * children.
 */
function last_child()
{
    $total = count($this->children);
    if ($total <= 0) {
        return null;
    }

    return $this->children[$total - 1];
}
387
+
388
/**
 * Fetch the node that follows this one in the parent's child list.
 *
 * @return object|null The following sibling, or null when there is no
 * parent, this node is the last child, or this node is not found in the
 * parent's child list.
 */
function next_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $total = count($siblings);

    // Locate this node by identity; $pos ends at $total when it is absent.
    for ($pos = 0; $pos < $total; ++$pos) {
        if ($siblings[$pos] === $this) {
            break;
        }
    }

    $next = $pos + 1;
    return ($next < $total) ? $siblings[$next] : null;
}
413
+
414
/**
 * Fetch the node that precedes this one in the parent's child list.
 *
 * Fix: when this node is not present in the parent's child list the scan
 * used to run off the end and the parent's LAST child was returned. The
 * method now returns null in that case, matching next_sibling().
 *
 * @return object|null The preceding sibling, or null when there is no
 * parent, this node is the first child, or this node is not found in the
 * parent's child list.
 */
function prev_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $count = count($siblings);

    // Locate this node by identity.
    $idx = 0;
    while ($idx < $count && $this !== $siblings[$idx]) {
        ++$idx;
    }

    // Not a registered child of the parent: there is no previous sibling.
    if ($idx >= $count) {
        return null;
    }

    if (--$idx < 0) {
        return null;
    }

    return $siblings[$idx];
}
430
+
431
/**
 * Walk up the parent chain until a node with the given tag is found.
 *
 * The search starts with this node itself, so a node whose own tag
 * matches is returned as its own "ancestor".
 *
 * @param string $tag Tag to find
 * @return object|null The first matching node on the way up, or null when
 * the chain ends (a parent of null) without a match.
 */
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    for ($candidate = $this; $candidate !== null; $candidate = $candidate->parent) {
        if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $candidate->tag); }

        if ($candidate->tag == $tag) {
            return $candidate;
        }
    }

    return null;
}
461
+
462
/**
 * Get node's inner text (everything inside the opening and closing tags).
 *
 * Resolution order:
 * 1. the cached inner text (HDOM_INFO_INNER), when set;
 * 2. the stored text (HDOM_INFO_TEXT) passed through the DOM's
 *    restore_noise(), when set;
 * 3. the concatenated outer text of every entry in the node list.
 *
 * @return string
 */
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $inner = '';
    foreach ($this->nodes as $child) {
        $inner .= $child->outertext();
    }

    return $inner;
}
477
+
478
/**
 * Get node's outer text (everything including the opening and closing tags).
 *
 * The root node simply yields its inner text. For every other node the
 * DOM callback (if one is set) runs first; afterwards a cached outer text
 * or stored text wins, and only then is the markup assembled from the
 * begin tag, the inner content and the closing tag.
 *
 * @return string
 */
function outertext()
{
    global $debug_object;
    if (is_object($debug_object)) {
        $text = '';
        if ($this->tag == 'text' && !empty($this->text)) {
            $text = " with text: " . $this->text;
        }
        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag === 'root') {
        return $this->innertext();
    }

    // Give the registered callback a chance to touch the node first.
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // Opening tag, rendered by the node registered at our begin index.
    $html = "";
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
        $html = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // Inner content: cached inner text if present, else the rendered nodes.
    if (!isset($this->_[HDOM_INFO_INNER])) {
        if ($this->nodes) {
            foreach ($this->nodes as $child) {
                $html .= $this->convert_text($child->outertext());
            }
        }
    } elseif ($this->tag != "br") {
        // A br tag never emits inner text, even if some was stored for it.
        $html .= $this->_[HDOM_INFO_INNER];
    }

    // Closing tag, only when an end index other than 0 was recorded.
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
        $html .= '</' . $this->tag . '>';
    }

    return $html;
}
541
+
542
/**
 * Get node's plain text (everything excluding all tags).
 *
 * A cached inner text is returned as-is. Text nodes return their stored
 * text via the DOM's restore_noise(); comment and unknown nodes as well as
 * script and style elements yield an empty string. Otherwise the plain
 * text of every entry in the node list is concatenated and trimmed.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;
    if ($type == HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN) {
        return '';
    }

    if (strcasecmp($this->tag, 'script') === 0 || strcasecmp($this->tag, 'style') === 0) {
        return '';
    }

    $plain = '';

    // The node list has been observed to be null for some element nodes;
    // in that case there is nothing to collect.
    if ($this->nodes !== null) {
        foreach ($this->nodes as $child) {
            // Paragraphs start after a blank line.
            if ($child->tag == 'p') {
                $plain .= "\n\n";
            }

            $plain .= $this->convert_text($child->text());

            // Append the DOM's span suffix so adjacent spans do not run
            // into each other in the plain text.
            if ($child->tag == "span") {
                $plain .= $this->dom->default_span_text;
            }
        }
    }

    return trim($plain);
}
584
+
585
/**
 * Get node's xml text: the inner text with CDATA markers removed.
 *
 * The opening marker `<![CDATA[` is removed case-insensitively, the
 * closing marker `]]>` case-sensitively.
 *
 * @return string
 */
function xmltext()
{
    $withoutOpening = str_ireplace('<![CDATA[', '', $this->innertext());

    return str_replace(']]>', '', $withoutOpening);
}
597
+
598
// Render this node's opening tag, including its attributes.
//
// Nodes that carry stored text (text, comment, unknown) return that text
// through the DOM's restore_noise() instead of a tag.
function makeup()
{
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $markup = '<' . $this->tag;

    // Spacing and quoting info is stored per attribute position, so the
    // position advances for removed attributes as well.
    $position = 0;
    foreach ($this->attr as $name => $value) {
        $slot = $position++;

        // Attributes set to null or false count as removed.
        if ($value === null || $value === false) {
            continue;
        }

        $markup .= $this->_[HDOM_INFO_SPACE][$slot][0];

        // Value-less attributes (nowrap, checked, selected, ...).
        if ($value === true) {
            $markup .= $name;
            continue;
        }

        $quoteType = $this->_[HDOM_INFO_QUOTE][$slot];
        if ($quoteType == HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($quoteType == HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $markup .= $name
            . $this->_[HDOM_INFO_SPACE][$slot][1]
            . '='
            . $this->_[HDOM_INFO_SPACE][$slot][2]
            . $quote . $value . $quote;
    }

    $markup = $this->dom->restore_noise($markup);

    return $markup . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
632
+
633
// Find elements below this node by CSS selector.
//
// $selector:  selector string; comma separated groups are searched
//             independently and their results merged.
// $idx:       null returns all matches as an array; an integer returns the
//             match at that position (negative values count from the end)
//             or null when no such match exists.
// $lowercase: passed on to seek() to request case-insensitive comparison
//             of selector values.
function find($selector, $idx=null, $lowercase=false)
{
    $selectors = $this->parse_selector($selector);
    $groupCount = count($selectors);
    if ($groupCount === 0) {
        return array();
    }

    $found_keys = array();

    for ($g = 0; $g < $groupCount; ++$g) {
        $chain = $selectors[$g];
        $depth = count($chain);
        if ($depth === 0) {
            return array();
        }
        if (!isset($this->_[HDOM_INFO_BEGIN])) {
            return array();
        }

        // Descendant selectors are resolved level by level (no recursion):
        // the matches of one level are the starting nodes of the next.
        $head = array($this->_[HDOM_INFO_BEGIN] => 1);
        for ($level = 0; $level < $depth; ++$level) {
            $matched = array();
            foreach ($head as $key => $unused) {
                $start = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                $start->seek($chain[$level], $matched, $lowercase);
            }
            $head = $matched;
        }

        foreach ($head as $key => $unused) {
            $found_keys[$key] = 1;
        }
    }

    // Order the merged matches by their node index.
    ksort($found_keys);

    $found = array();
    foreach ($found_keys as $key => $unused) {
        $found[] = $this->dom->nodes[$key];
    }

    if (is_null($idx)) {
        return $found;
    }
    if ($idx < 0) {
        $idx = count($found) + $idx;
    }

    return isset($found[$idx]) ? $found[$idx] : null;
}
685
+
686
// Collect the nodes below this node that satisfy ONE parsed selector step.
//
// $selector:  array($tag, $key, $val, $exp, $no_key) as built by
//             parse_selector().
// $ret:       passed by reference; the index of every matching node is
//             added as a key (value 1). This is the function's result.
// $lowercase: when true, selector values are compared case-insensitively.
//
// Fix: while looking for the end index on the ancestors, the parent is now
// checked for null BEFORE its info array is inspected, and the end index is
// only added when an ancestor providing one was found. Previously a null
// parent was dereferenced once the chain was exhausted. The computed range
// is unchanged.
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index: pick the $key-th direct child with the requested tag.
    if ($tag && $key && is_numeric($key)) {
        $count = 0;
        foreach ($this->children as $c) {
            if ($tag === '*' || $tag === $c->tag) {
                if (++$count == $key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    // Determine where this node's range of DOM indices ends. Without an own
    // end index, walk up until an ancestor provides one.
    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end == 0) {
        $parent = $this->parent;
        while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        if ($parent !== null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i = $this->_[HDOM_INFO_BEGIN] + 1; $i < $end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // A bare "*" only selects direct children.
        if ($tag === '*' && !$key) {
            if (in_array($node, $this->children, true)) {
                $ret[$i] = 1;
            }
            continue;
        }

        // compare tag
        if ($tag && $tag != $node->tag && $tag !== '*') { $pass = false; }

        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) { $pass = false; }
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) { $pass = false; }
            }
        }

        // compare value
        if ($pass && $key && $val && $val !== '*') {
            if ($key == "plaintext") {
                // A "plaintext" search compares against the node's plain text.
                $nodeKeyValue = $node->text();
            } else {
                // Normal search: compare against the attribute value.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class: test each space separated class name
            if (!$check && strcasecmp($key, 'class') === 0) {
                foreach (explode(' ', $node->attr[$key]) as $k) {
                    // Leading, trailing or double spaces produce empty
                    // entries; those are never compared.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) { break; }
                    }
                }
            }
            if (!$check) { $pass = false; }
        }

        if ($pass) { $ret[$i] = 1; }
        unset($node);
    }

    // $ret is passed by reference, so it carries the result to the caller.
    if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
}
784
+
785
// Test a value against a selector pattern using the given operator.
//
// $exp:     one of '=', '!=', '^=', '$=', '*='; anything else yields false.
// $pattern: the value written in the selector.
// $value:   the value found on the node.
//
// '=' and '!=' compare strictly. '^=' and '$=' anchor the quoted pattern
// at the start / end. '*=' uses the pattern as a regular expression: as-is
// when it starts with '/', otherwise wrapped in /.../i. The regex based
// operators return the result of preg_match().
protected function match($exp, $pattern, $value) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    switch ($exp) {
        case '=':
            return ($value === $pattern);
        case '!=':
            return ($value !== $pattern);
        case '^=':
            $regex = "/^" . preg_quote($pattern, '/') . "/";
            break;
        case '$=':
            $regex = "/" . preg_quote($pattern, '/') . "$/";
            break;
        case '*=':
            $regex = ($pattern[0] == '/') ? $pattern : "/" . $pattern . "/i";
            break;
        default:
            return false;
    }

    return preg_match($regex, $value);
}
806
+
807
// Split a selector string into groups of selector steps.
//
// Returns a list with one entry per comma separated selector; each entry
// is a list of steps of the form array($tag, $key, $val, $exp, $no_key).
protected function parse_selector($selector_string) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    // Pattern of CSS selectors, modified from mootools.
    // - The colon is accepted in tag and attribute names, so attributes
    //   such as <tag attr:ibute="something"> can be selected. Such an
    //   attribute has to be read with getAttribute(), because $dom->x:y is
    //   not valid PHP syntax.
    // - The attribute part accepts an optional leading "@" that is not
    //   captured.
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}

    $selectors = array();
    $steps = array();

    foreach ($matches as $m) {
        // Skip empty matches and bare path separators.
        if (in_array(trim($m[0]), array('', '/', '//'), true)) {
            continue;
        }
        // tbody steps are skipped (for browser generated xpath).
        if ($m[1] === 'tbody') {
            continue;
        }

        $tag = $m[1];
        $key = null;
        $val = null;
        $exp = '=';
        $no_key = false;

        if (!empty($m[2])) { $key = 'id'; $val = $m[2]; }
        if (!empty($m[3])) { $key = 'class'; $val = $m[3]; }
        if (!empty($m[4])) { $key = $m[4]; }
        if (!empty($m[5])) { $exp = $m[5]; }
        if (!empty($m[6])) { $val = $m[6]; }

        // convert to lowercase
        if ($this->dom->lowercase) {
            $tag = strtolower($tag);
            $key = strtolower($key);
        }

        // A leading "!" selects elements that do NOT have the attribute.
        if (isset($key[0]) && $key[0] === '!') {
            $key = substr($key, 1);
            $no_key = true;
        }

        $steps[] = array($tag, $key, $val, $exp, $no_key);

        // A comma ends the current selector and starts a new one.
        if (trim($m[7]) === ',') {
            $selectors[] = $steps;
            $steps = array();
        }
    }

    if (count($steps) > 0) {
        $selectors[] = $steps;
    }

    return $selectors;
}
854
+
855
/**
 * Magic getter for attributes and the virtual text properties.
 *
 * An attribute that is set is returned through convert_text(). Otherwise
 * the names outertext, innertext, plaintext and xmltext map to the
 * corresponding methods. For any other name the result is a boolean
 * telling whether the attribute key exists.
 *
 * @param string $name Attribute or virtual property name
 * @return mixed
 */
function __get($name)
{
    if (isset($this->attr[$name])) {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext') {
        return $this->outertext();
    }
    if ($name == 'innertext') {
        return $this->innertext();
    }
    if ($name == 'plaintext') {
        return $this->text();
    }
    if ($name == 'xmltext') {
        return $this->xmltext();
    }

    return array_key_exists($name, $this->attr);
}
870
+
871
/**
 * Magic setter for attributes and the virtual text properties.
 *
 * `outertext` stores the value as cached outer text. `innertext` replaces
 * the stored text when the node has one, otherwise it stores the value as
 * cached inner text. Every other name is treated as an attribute; when no
 * attribute of that name is set yet, default spacing and double quoting
 * are registered for it as well.
 *
 * @param string $name  Attribute or virtual property name
 * @param mixed  $value New value
 * @return mixed The assigned value for the virtual properties, nothing
 * for attributes.
 */
function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($name == 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }

    if ($name == 'innertext') {
        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
        return $this->_[$slot] = $value;
    }

    if (!isset($this->attr[$name])) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
890
+
891
/**
 * Magic isset handler.
 *
 * The virtual properties outertext, innertext and plaintext always count
 * as set. Any other name counts as set when the attribute key exists,
 * which includes value-less attributes (nowrap, checked, selected, ...).
 *
 * @param string $name Attribute or virtual property name
 * @return bool
 */
function __isset($name)
{
    if (in_array($name, array('outertext', 'innertext', 'plaintext'))) {
        return true;
    }

    return array_key_exists($name, $this->attr);
}
902
+
903
/**
 * Magic unset handler: drop an attribute that is currently set.
 *
 * @param string $name Attribute name
 */
function __unset($name) {
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
907
+
908
// Convert text from the DOM's source charset to its target charset when
// both are known and differ, then strip a UTF-8 byte order mark from
// either end of the result when the target charset is UTF-8.
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    $sourceCharset = "";
    $targetCharset = "";
    if ($this->dom) {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

    $converted_text = $text;

    $charsetsDiffer = !empty($sourceCharset)
        && !empty($targetCharset)
        && (strcasecmp($sourceCharset, $targetCharset) != 0);

    if ($charsetsDiffer) {
        // The reported source encoding may be wrong: text that already is
        // UTF-8 is left untouched when UTF-8 is the target.
        $alreadyTarget = (strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text));
        if (!$alreadyTarget) {
            $converted_text = iconv($sourceCharset, $targetCharset, $text);
        }
    }

    // Never hand out UTF-8 text with a byte order mark attached.
    if ($targetCharset == 'UTF-8') {
        $bom = "\xef\xbb\xbf";
        if (substr($converted_text, 0, 3) == $bom) {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == $bom) {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
954
+
955
/**
 * Returns true if $str passes a structural UTF-8 check, false otherwise.
 *
 * Every byte with the high bit set must be a lead byte that is followed by
 * the number of continuation bytes (0x80-0xBF) its value announces. Lead
 * bytes announcing up to six bytes are accepted, as before.
 *
 * Fix: the high-bit test used `> 128`, so a stray 0x80 byte was treated
 * like ASCII and accepted. It now uses `>= 128`, which rejects 0x80 when it
 * does not follow a lead byte.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);
        if ($c < 128) {
            // Plain ASCII byte.
            continue;
        }

        // Derive the sequence length from the lead byte.
        if ($c >= 254) return false;
        elseif ($c >= 252) $bits = 6;
        elseif ($c >= 248) $bits = 5;
        elseif ($c >= 240) $bits = 4;
        elseif ($c >= 224) $bits = 3;
        elseif ($c >= 192) $bits = 2;
        else return false; // continuation byte without a lead byte

        // The sequence must fit into the remaining input.
        if (($i + $bits) > $len) return false;

        // Every following byte of the sequence must be a continuation byte.
        while ($bits > 1) {
            $i++;
            $b = ord($str[$i]);
            if ($b < 128 || $b > 191) return false;
            $bits--;
        }
    }

    return true;
}
990
+ /*
991
+ function is_utf8($string)
992
+ {
993
+ //this is buggy
994
+ return (utf8_encode(utf8_decode($string)) == $string);
995
+ }
996
+ */
997
+
998
/**
 * Try to determine the displayed size of an img element.
 *
 * Only works on img tags. The `width` and `height` attributes are used
 * when set. A dimension that is still unknown afterwards is taken from the
 * inline `style` attribute, provided the declaration there ends in "px"
 * and the part before it passes the integer filter.
 *
 * Possible future enhancements: look at classes or ids on the tag or on
 * its parents that define a size, including rules from separate css files.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false false for any tag other than img; otherwise an array
 * with the keys 'height' and 'width', each holding the detected value or -1
 * when it could not be determined.
 */
function get_display_size()
{
    global $debug_object;

    if ($this->tag !== 'img') {
        return false;
    }

    $dimensions = array('width', 'height');
    $size = array('width' => -1, 'height' => -1);

    // Explicit attributes on the tag itself come first.
    foreach ($dimensions as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Then look at the inline style.
    if (isset($this->attr['style'])) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $attributes = array();
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $attributes[$match[1]] = $match[2];
        }

        foreach ($dimensions as $dimension) {
            // Only fill in what the attributes did not provide.
            if (!isset($attributes[$dimension]) || $size[$dimension] != -1) {
                continue;
            }

            // The declaration has to end in px (pixels) ...
            if (strtolower(substr($attributes[$dimension], -2)) != 'px') {
                continue;
            }

            // ... and the part before it has to pass the integer filter.
            $proposed = substr($attributes[$dimension], 0, -2);
            if (filter_var($proposed, FILTER_VALIDATE_INT)) {
                $size[$dimension] = $proposed;
            }
        }
    }

    return array(
        'height' => $size['height'],
        'width' => $size['width']
    );
}
1085
+
1086
// camelCase aliases: each method below forwards to the method or property
// named in its body.

function getAllAttributes()
{
    return $this->attr;
}

function getAttribute($name)
{
    return $this->__get($name);
}

function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

function hasAttribute($name)
{
    return $this->__isset($name);
}

// Removal is expressed by setting the attribute to null.
function removeAttribute($name)
{
    $this->__set($name, null);
}

function getElementById($id)
{
    return $this->find("#$id", 0);
}

function getElementsById($id, $idx=null)
{
    return $this->find("#$id", $idx);
}

function getElementByTagName($name)
{
    return $this->find($name, 0);
}

function getElementsByTagName($name, $idx=null)
{
    return $this->find($name, $idx);
}

function parentNode()
{
    return $this->parent();
}

function childNodes($idx=-1)
{
    return $this->children($idx);
}

function firstChild()
{
    return $this->first_child();
}

function lastChild()
{
    return $this->last_child();
}

function nextSibling()
{
    return $this->next_sibling();
}

function previousSibling()
{
    return $this->prev_sibling();
}

function hasChildNodes()
{
    return $this->has_child();
}

function nodeName()
{
    return $this->tag;
}

// Attaches $node below this node and hands the same node back.
function appendChild($node)
{
    $node->parent($this);
    return $node;
}
1105
+
1106
+ }
1107
+
1108
+ /**
1109
+ * simple html dom parser
1110
+ * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
1111
+ * Paperg - change $size from protected to public so we can easily access it
1112
+ * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
1113
+ *
1114
+ * @package PlaceLocalInclude
1115
+ */
1116
+ class simple_html_dom
1117
+ {
1118
+ /**
1119
+ * The root node of the document
1120
+ *
1121
+ * @var object
1122
+ */
1123
+ public $root = null;
1124
+
1125
+ /**
1126
+ * List of nodes in the current DOM
1127
+ *
1128
+ * @var array
1129
+ */
1130
+ public $nodes = array();
1131
+
1132
+ /**
1133
+ * Callback function to run for each element in the DOM.
1134
+ *
1135
+ * @var callable|null
1136
+ */
1137
+ public $callback = null;
1138
+
1139
+ /**
1140
+ * Indicates how tags and attributes are matched
1141
+ *
1142
+ * @var bool When set to **true** tags and attributes will be converted to
1143
+ * lowercase before matching.
1144
+ */
1145
+ public $lowercase = false;
1146
+
1147
+ /**
1148
+ * Original document size
1149
+ *
1150
+ * Holds the original document size.
1151
+ *
1152
+ * @var int
1153
+ */
1154
+ public $original_size;
1155
+
1156
+ /**
1157
+ * Current document size
1158
+ *
1159
+ * Holds the current document size. The document size is determined by the
1160
+ * string length of ({@see simple_html_dom::$doc}).
1161
+ *
1162
+ * _Note_: Using this variable is more efficient than calling `strlen($doc)`
1163
+ *
1164
+ * @var int
1165
+ * */
1166
+ public $size;
1167
+
1168
+ /**
1169
+ * Current position in the document
1170
+ *
1171
+ * @var int
1172
+ */
1173
+ protected $pos;
1174
+
1175
+ /**
1176
+ * The document
1177
+ *
1178
+ * @var string
1179
+ */
1180
+ protected $doc;
1181
+
1182
+ /**
1183
+ * Current character
1184
+ *
1185
+ * Holds the current character at position {@see simple_html_dom::$pos} in
1186
+ * the document {@see simple_html_dom::$doc}
1187
+ *
1188
+ * _Note_: Using this variable is more efficient than calling `substr($doc, $pos, 1)`
1189
+ *
1190
+ * @var string
1191
+ */
1192
+ protected $char;
1193
+
1194
+ protected $cursor;
1195
+
1196
+ /**
1197
+ * Parent node of the next node detected by the parser
1198
+ *
1199
+ * @var object
1200
+ */
1201
+ protected $parent;
1202
+ protected $noise = array();
1203
+
1204
+ /**
1205
+ * Tokens considered blank in HTML
1206
+ *
1207
+ * @var string
1208
+ */
1209
+ protected $token_blank = " \t\r\n";
1210
+
1211
+ /**
1212
+ * Tokens to identify the equal sign for attributes, stopping either at the
1213
+ * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e.
1214
+ * "<html>")
1215
+ *
1216
+ * @var string
1217
+ */
1218
+ protected $token_equal = ' =/>';
1219
+
1220
+ /**
1221
+ * Tokens to identify the end of a tag name. A tag name either ends on the
1222
+ * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t")
1223
+ *
1224
+ * @var string
1225
+ */
1226
+ protected $token_slash = " />\r\n\t";
1227
+
1228
+ /**
1229
+ * Tokens to identify the end of an attribute
1230
+ *
1231
+ * @var string
1232
+ */
1233
+ protected $token_attr = ' >';
1234
+
1235
+ // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
1236
+ public $_charset = '';
1237
+ public $_target_charset = '';
1238
+
1239
+ /**
1240
+ * Innertext for <br> elements
1241
+ *
1242
+ * @var string
1243
+ */
1244
+ protected $default_br_text = "";
1245
+
1246
+ /**
1247
+ * Suffix for <span> elements
1248
+ *
1249
+ * @var string
1250
+ */
1251
+ public $default_span_text = "";
1252
+
1253
+ /**
1254
+ * Defines a list of self-closing tags (Void elements) according to the HTML
1255
+ * Specification
1256
+ *
1257
+ * _Remarks_:
1258
+ * - Use `isset()` instead of `in_array()` on array elements to boost
1259
+ * performance about 30%
1260
+ * - Sort elements by name for better readability!
1261
+ *
1262
+ * @link https://www.w3.org/TR/html HTML Specification
1263
+ * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
1264
+ */
1265
+ protected $self_closing_tags = array(
1266
+ 'area'=>1,
1267
+ 'base'=>1,
1268
+ 'br'=>1,
1269
+ 'col'=>1,
1270
+ 'embed'=>1,
1271
+ 'hr'=>1,
1272
+ 'img'=>1,
1273
+ 'input'=>1,
1274
+ 'link'=>1,
1275
+ 'meta'=>1,
1276
+ 'param'=>1,
1277
+ 'source'=>1,
1278
+ 'track'=>1,
1279
+ 'wbr'=>1
1280
+ );
1281
+
1282
+ /**
1283
+ * Defines a list of tags which - if closed - close all optional closing
1284
+ * elements within if they haven't been closed yet. (So, an element where
1285
+ * neither opening nor closing tag is omissible consistently closes every
1286
+ * optional closing element within)
1287
+ *
1288
+ * _Remarks_:
1289
+ * - Use `isset()` instead of `in_array()` on array elements to boost
1290
+ * performance about 30%
1291
+ * - Sort elements by name for better readability!
1292
+ */
1293
+ protected $block_tags = array(
1294
+ 'body'=>1,
1295
+ 'div'=>1,
1296
+ 'form'=>1,
1297
+ 'root'=>1,
1298
+ 'span'=>1,
1299
+ 'table'=>1
1300
+ );
1301
+
1302
+ /**
1303
+ * Defines elements whose end tag is omissible.
1304
+ *
1305
+ * * key = Name of an element whose end tag is omissible.
1306
+ * * value = Names of elements whose end tag is omissible, that are closed
1307
+ * by the current element.
1308
+ *
1309
+ * _Remarks_:
1310
+ * - Use `isset()` instead of `in_array()` on array elements to boost
1311
+ * performance about 30%
1312
+ * - Sort elements by name for better readability!
1313
+ *
1314
+ * **Example**
1315
+ *
1316
+ * An `li` element’s end tag may be omitted if the `li` element is immediately
1317
+ * followed by another `li` element. To do that, add following element to the
1318
+ * array:
1319
+ *
1320
+ * ```php
1321
+ * 'li' => array('li'),
1322
+ * ```
1323
+ *
1324
+ * With this, the following two examples are considered equal. Note that the
1325
+ * second example is missing the closing tags on `li` elements.
1326
+ *
1327
+ * ```html
1328
+ * <ul><li>First Item</li><li>Second Item</li></ul>
1329
+ * ```
1330
+ *
1331
+ * <ul><li>First Item</li><li>Second Item</li></ul>
1332
+ *
1333
+ * ```html
1334
+ * <ul><li>First Item<li>Second Item</ul>
1335
+ * ```
1336
+ *
1337
+ * <ul><li>First Item<li>Second Item</ul>
1338
+ *
1339
+ * @var array A two-dimensional array where the key is the name of an
1340
+ * element whose end tag is omissible and the value is an array of elements
1341
+ * whose end tag is omissible, that are closed by the current element.
1342
+ *
1343
+ * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
1344
+ *
1345
+ * @todo The implementation of optional closing tags doesn't work in all cases
1346
+ * because it only considers elements that close other optional closing
1347
+ * tags, not taking into account that some (non-blocking) tags should close
1348
+ * these optional closing tags. For example, the end tag for "p" is omissible
1349
+ * and can be closed by an "address" element, whose end tag is NOT omissible.
1350
+ * Currently a "p" element without closing tag stops at the next "p" element
1351
+ * or blocking tag, even if it contains other elements.
1352
+ *
1353
+ * @todo Known sourceforge issue #2977341
1354
+ * B tags that are not closed cause us to return everything to the end of
1355
+ * the document.
1356
+ */
1357
+ protected $optional_closing_tags = array(
1358
+ 'b'=>array('b'=>1), // Not optional, see https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1359
+ 'dd'=>array('dd'=>1, 'dt'=>1),
1360
+ 'dl'=>array('dd'=>1, 'dt'=>1), // Not optional, see https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1361
+ 'dt'=>array('dd'=>1, 'dt'=>1),
1362
+ 'li'=>array('li'=>1),
1363
+ 'optgroup'=>array('optgroup'=>1, 'option'=>1),
1364
+ 'option'=>array('optgroup'=>1, 'option'=>1),
1365
+ 'p'=>array('p'=>1),
1366
+ 'rp'=>array('rp'=>1, 'rt'=>1),
1367
+ 'rt'=>array('rp'=>1, 'rt'=>1),
1368
+ 'td'=>array('td'=>1, 'th'=>1),
1369
+ 'th'=>array('td'=>1, 'th'=>1),
1370
+ 'tr'=>array('td'=>1, 'th'=>1, 'tr'=>1),
1371
+ );
1372
+
1373
/**
 * Create a DOM object and optionally load a document right away.
 *
 * @param string|null $str Either a file path / "http://" URL (loaded via
 * load_file()) or a string of HTML (loaded via load()). Nothing is loaded
 * when the value is empty.
 * @param bool $lowercase Passed to load(): lowercase tag and attribute names.
 * @param bool $forceTagsClosed When false, the list of optional closing tags
 * is emptied so the parser never closes such elements implicitly.
 * @param string $target_charset Stored as the target charset of this document.
 * @param bool $stripRN Passed to load(): replace "\r" and "\n" with spaces.
 * @param string $defaultBRText Passed to load(): text used for <br> elements.
 * @param string $defaultSpanText Passed to load(): text used for <span> elements.
 * @param int $options Passed to load(): parsing option flags.
 */
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0)
{
    // Forcing tags to be closed implies that we don't trust the html, but it
    // can lead to parsing errors if we SHOULD trust the html.
    // This has to happen BEFORE the document is parsed, and it has to target
    // the property the parser actually reads ($optional_closing_tags).
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }
    $this->_target_charset = $target_charset;

    if ($str) {
        if (preg_match("/^http:\/\//i", $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText, $options);
        }
    }
}
1392
+
1393
/**
 * Release all node references held by this document when it is destroyed.
 */
function __destruct()
{
    $this->clear();
}
1397
+
1398
/**
 * Load HTML from a string and parse it into this document.
 *
 * Content that would confuse the tag parser (scripts, CDATA, comments,
 * styles, <code> blocks, server side scripts and optionally Smarty blocks)
 * is swapped for placeholder keys via remove_noise() before parsing.
 *
 * @param string $str HTML source
 * @param bool $lowercase Lowercase tag and attribute names
 * @param bool $stripRN Replace every "\r" and "\n" with a space
 * @param string $defaultBRText Text used for <br> elements
 * @param string $defaultSpanText Text used for <span> elements
 * @param int $options Bit flags; HDOM_SMARTY_AS_TEXT strips Smarty scripts
 * @return simple_html_dom This object, so calls can be chained
 */
function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0)
{
    global $debug_object;

    // Reset the parser state for the new document.
    $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // script removal has to precede style removal.
    $script_patterns = array(
        "'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is",
        "'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is",
    );
    foreach ($script_patterns as $pattern) {
        $this->remove_noise($pattern);
    }

    if ($stripRN) {
        // Line breaks become single spaces; the cached size is refreshed
        // because the content was modified.
        $this->doc = str_replace(array("\r", "\n"), " ", $this->doc);
        $this->size = strlen($this->doc);
    }

    // CDATA sections (whole match removed)
    $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
    // comments
    $this->remove_noise("'<!--(.*?)-->'is");
    // <style> tags, with and without attributes
    $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
    $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
    // preformatted <code> tags
    $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
    // server side scripts (whole match removed)
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

    // Smarty scripts, only on request
    if ($options & HDOM_SMARTY_AS_TEXT) {
        $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
    }

    // Build the node tree, close the root node and detect the charset.
    $this->parse();
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    return $this;
}
1447
+
1448
/**
 * Load HTML from a file or URL.
 *
 * All arguments are forwarded unchanged to file_get_contents(), so the
 * signature is the one of that function (filename first).
 *
 * @return false|null False when the content could not be read, nothing
 * otherwise.
 */
function load_file()
{
    $args = func_get_args();

    // The assignment must be parenthesised: "!==" binds tighter than "=",
    // so without the parentheses $doc would receive the boolean result of the
    // comparison instead of the file content.
    if (($doc = call_user_func_array('file_get_contents', $args)) !== false) {
        $this->load($doc, true);
    } else {
        return false;
    }
}
1459
+
1460
/**
 * Register the callback function.
 *
 * The callable is only stored on the document here (in $this->callback).
 *
 * @param callable $function_name Callback function to run for each element
 * in the DOM.
 * @return void
 */
function set_callback($function_name)
{
    $this->callback = $function_name;
}
1471
+
1472
/**
 * Unregister the callback function by resetting $this->callback to null.
 *
 * @return void
 */
function remove_callback()
{
    $this->callback = null;
}
1481
+
1482
/**
 * Serialize the document to a string and optionally write it to a file.
 *
 * @param string $filepath Target file; nothing is written when empty.
 * The file is written with an exclusive lock (LOCK_EX).
 * @return string The inner text of the root node
 */
function save($filepath='')
{
    $html = $this->root->innertext();

    if ($filepath !== '') {
        file_put_contents($filepath, $html, LOCK_EX);
    }

    return $html;
}
1489
+
1490
/**
 * Find DOM nodes by CSS selector, starting at the root node.
 *
 * Paperg - $lowercase allows case insensitive testing of the value of the
 * selector.
 *
 * @param string $selector CSS selector
 * @param int|null $idx Forwarded to the root node's find()
 * @param bool $lowercase Forwarded to the root node's find()
 * @return mixed Whatever the root node's find() returns
 */
function find($selector, $idx=null, $lowercase=false)
{
    $root = $this->root;
    return $root->find($selector, $idx, $lowercase);
}
1496
+
1497
/**
 * Clean up memory (works around the php5 circular references memory leak).
 *
 * Clears every node of the document, then the children, parent and root
 * references if they are set, and finally drops the document text and the
 * noise table.
 */
function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    // Documented in the sourceforge repository (2977248) as a fix for ongoing
    // memory leaks that occur even with the use of clear.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
            $child = null;
        }
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc, $this->noise);
}
1508
+
1509
/**
 * Dump the node tree by delegating to the root node's dump().
 *
 * @param bool $show_attr Forwarded to the root node's dump()
 */
function dump($show_attr=true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
1513
+
1514
/**
 * Prepare HTML data and initialize the parser state.
 *
 * @param string $str HTML source; it is trimmed before being stored
 * @param bool $lowercase Lowercase tag and attribute names while parsing
 * @param string $defaultBRText Text used for <br> elements
 * @param string $defaultSpanText Text used for <span> elements
 */
protected function prepare($str, $lowercase=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Drop whatever a previous document left behind.
    $this->clear();

    // Document text and its length. The original size is kept separately
    // because later processing changes the size of the document.
    $this->doc = trim($str);
    $this->size = strlen($this->doc);
    $this->original_size = $this->size;

    // Parser position and bookkeeping.
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();

    // Parsing options.
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // The root node is the initial parent of everything that gets parsed.
    $root = new simple_html_dom_node($this);
    $this->root = $root;
    $this->root->tag = 'root';
    $this->root->_[HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $this->root;

    // Current character, only available for non-empty documents.
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
1536
+
1537
/**
 * Parse HTML content
 *
 * Alternates between reading tags and collecting the text between them
 * until read_tag() reports that no further tag is available.
 *
 * @return bool True on success
 */
protected function parse()
{
    for (;;) {
        $text = $this->copy_until_char('<');

        // No text between the current position and the next opening tag:
        // read that tag, or stop when there is none.
        if ($text === '') {
            if (!$this->read_tag()) {
                return true;
            }
            continue;
        }

        // Text between tags becomes a text node of the current parent.
        $node = new simple_html_dom_node($this);
        ++$this->cursor;
        $node->_[HDOM_INFO_TEXT] = $text;
        $this->link_nodes($node, false);
    }
}
1563
+
1564
/**
 * Identify the character set of the document that was just parsed.
 *
 * Sources are tried in this order:
 *  1. The content-type header returned by the optional user supplied
 *     function get_last_retrieve_url_contents_content_type() (the
 *     CURLINFO_CONTENT_TYPE of the last transfer), if that function exists.
 *  2. A <meta http-equiv="Content-Type"> element; ISO-8859-1 is assumed when
 *     its content carries no charset.
 *  3. mb_detect_encoding() on the document text, if available.
 *  4. UTF-8 as the last resort.
 *
 * ISO-8859-1 / Latin1 / Latin-1 are reported as CP1252.
 *
 * @return string The detected charset, also stored in $this->_charset
 */
protected function parse_charset()
{
    global $debug_object;

    $charset = null;

    // 1. Content-type header of the last transfer
    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        if (preg_match('/charset=(.+)/', $contentTypeHeader, $matches)) {
            $charset = $matches[1];
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);}
        }
    }

    // 2. Meta content-type element
    if (empty($charset)) {
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
        if (!empty($el)) {
            $fullvalue = $el->content;
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);}

            if (!empty($fullvalue)) {
                if (preg_match('/charset=(.+)/i', $fullvalue, $matches)) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // 3. + 4. Detect from the text, or give up and assume UTF-8
    if (empty($charset)) {
        // Stays false when mb_detect_encoding isn't installed/loaded on this machine.
        $charset = false;
        if (function_exists('mb_detect_encoding')) {
            $charset = mb_detect_encoding($this->doc . "ascii", array("UTF-8", "CP1252"));
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);}
        }

        if ($charset === false) {
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');}
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want it instead.
    $latin_aliases = array('iso-8859-1', 'latin1', 'latin-1');
    if (in_array(strtolower($charset), $latin_aliases, true)) {
        if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);}

    $this->_charset = $charset;
    return $charset;
}
1641
+
1642
/**
 * Parse tag from current document position.
 *
 * Handles, in this order: end tags (including end tags that implicitly close
 * ancestors), "<!...>" constructs (comments, doctype, CDATA), malformed start
 * tags (added as text) and regular start tags with their attributes.
 *
 * @return bool True if a tag was found, false otherwise
 */
protected function read_tag()
{
    // Set end position if no further tags found
    if ($this->char !== '<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char === '/') {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

        // Skip whitespace in end tags (i.e. in "</ html>")
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // Skip attributes in end tags
        if (($pos = strpos($tag, ' ')) !== false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        // The end tag is supposed to close the parent tag. Handle situations
        // when it doesn't
        if ($parent_lower !== $tag_lower) {
            // Parent tag does not have to be closed necessarily (optional closing tag)
            // Current tag is a block tag, so it may close an ancestor
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Traverse ancestors to find a matching opening tag
                // Stop at root node
                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                // If we don't have a match add current tag as text node
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) $this->parent = $this->parent->parent;
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            // Grandparent exists and current tag is a block tag, so our parent doesn't have an end tag
            else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0; // No end tag
                $org_parent = $this->parent;

                // Traverse ancestors to find a matching opening tag
                // Stop at root node
                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                // If we don't have a match add current tag as text node
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            // Grandparent exists and current tag closes it
            else if (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            else { // Random tag, add as text node
                return $this->as_text_node($tag);
            }
        }

        // Set end position of parent tag to current cursor position
        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) $this->parent = $this->parent->parent;

        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // start tag
    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash); // Get tag name
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    // <!DOCTYPE html>
    // <![CDATA[ ... ]]>
    // <!-- Comment -->
    if (isset($tag[0]) && $tag[0] === '!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else { // Could be doctype or CDATA but we don't care
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char === '>') $node->_[HDOM_INFO_TEXT] .= '>';
        $this->link_nodes($node, true);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // The start tag cannot contain another start tag, if so add as text
    // i.e. "<<html>"
    // (Plain comparison; the position itself is not needed here.)
    if (strpos($tag, '<') !== false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Handle invalid tag names (i.e. "<html#doc>")
    if (!preg_match("/^\w[\w:-]*$/", $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

        // Next char is the beginning of a new tag, don't touch it.
        if ($this->char === '<') {
            $this->link_nodes($node, false);
            return true;
        }

        // Next char closes current tag, add and be done with it.
        if ($this->char === '>') $node->_[HDOM_INFO_TEXT] .= '>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag, add new node
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower])) {
        // Traverse ancestors to close all optional closing tags
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array($this->copy_skip($this->token_blank), '', ''); // [0] Space between tag and first attribute

    // attributes
    do {
        // Everything until the first equal sign should be the attribute name
        $name = $this->copy_until($this->token_equal);

        if ($name === '' && $this->char !== null && $space[0] === '') {
            break;
        }

        if ($guard === $this->pos) { // Escape infinite loop
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos >= $this->size-1 && $this->char !== '>') { // Out of bounds before the tag ended
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos-1] == '<') { // Attributes cannot start after opening tag
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
            $this->pos -= 2;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name !== '/' && $name !== '') { // this is a attribute name
            $space[1] = $this->copy_skip($this->token_blank); // [1] Whitespace after attribute name
            $name = $this->restore_noise($name); // might be a noisy name
            if ($this->lowercase) $name = strtolower($name);
            if ($this->char === '=') { // attribute with value
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space); // get attribute value
            }
            else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char != '>') $this->char = $this->doc[--$this->pos]; // prev
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', ''); // prepare for next attribute
        }
        else { // no more attributes
            break;
        }
    } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // handle empty tags (i.e. "<div/>")
    if ($this->copy_until_char('>') === '/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    }
    else {
        // reset parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
    }
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set its text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    // The lowercased name is compared so "<BR>" is also recognised when tag
    // names are not lowercased ($this->lowercase === false).
    if ($tag_lower === 'br') {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
1897
+
1898
/**
 * Parse attribute from current document position
 *
 * The value is always consumed from the document. It is only stored when the
 * attribute is not yet defined on the node: if an attribute appears more than
 * once inside a tag, only the first one counts
 * (per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037,
 * see also https://stackoverflow.com/a/26341866).
 *
 * @param object $node Node for the attributes
 * @param string $name Name of the current attribute
 * @param array $space Array for spacing information
 * @return void
 */
protected function parse_attr($node, $name, &$space)
{
    $is_duplicate = isset($node->attr[$name]);

    // Whitespace between "=" and the value. It is skipped in any case, but
    // only recorded ([2]) for attributes that are actually stored.
    $blank = $this->copy_skip($this->token_blank);
    if (!$is_duplicate) {
        $space[2] = $blank;
    }

    switch ($this->char) {
        case '"': // value is anything between double quotes
            $quote_type = HDOM_QUOTE_DOUBLE;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('"');
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            break;
        case '\'': // value is anything between single quotes
            $quote_type = HDOM_QUOTE_SINGLE;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('\'');
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            break;
        default: // value is anything until the first space or end tag
            $quote_type = HDOM_QUOTE_NO;
            $value = $this->copy_until($this->token_attr);
    }

    // The value of a repeated attribute has now been skipped, so the caller
    // will not mistake it for a new attribute name. Nothing is stored for it.
    if ($is_duplicate) {
        return;
    }

    $value = $this->restore_noise($value);

    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $value = str_replace(array("\r", "\n"), "", $value);

    // PaperG: If this is a "class" selector, lets get rid of the preceding and trailing space since some people leave it in the multi class case.
    if ($name == "class") {
        $value = trim($value);
    }

    $node->_[HDOM_INFO_QUOTE][] = $quote_type;
    $node->attr[$name] = $value;
}
1942
+
1943
/**
 * Link node to the current parent node
 *
 * The node is always appended to the parent's node list; it is additionally
 * appended to the parent's children when $is_child is true.
 *
 * @param object $node Node to link to parent
 * @param bool $is_child True if the node is a child of parent
 * @return void
 */
protected function link_nodes(&$node, $is_child)
{
    $node->parent = $this->parent;
    $this->parent->nodes[] = $node;

    if (!$is_child) {
        return;
    }

    $this->parent->children[] = $node;
}
1960
+
1961
/**
 * Add an end tag as text node to the current node
 *
 * The text stored is the tag name wrapped in "</" and ">". Afterwards the
 * document position is advanced by one character.
 *
 * @param string $tag Tag name
 * @return bool True on success
 */
protected function as_text_node($tag)
{
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;
    $text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // next
    ++$this->pos;
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null;

    return true;
}
1976
+
1977
/**
 * Seek from the current document position to the first occurrence of a
 * character not defined by the provided string. Update the current document
 * position and the current character accordingly.
 *
 * @param string $chars A string containing every allowed character.
 * @return void
 */
protected function skip($chars)
{
    $span = strspn($this->doc, $chars, $this->pos);
    $this->pos += $span;

    // The current character is null once the end of the document is reached.
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null;
}
1990
+
1991
/**
 * Copy substring from the current document position to the first occurrence
 * of a character not defined by the provided string, and move the document
 * position behind that substring.
 *
 * @param string $chars A string containing every allowed character.
 * @return string Substring from the current document position to the first
 * occurrence of a character not defined by the provided string. Empty when
 * the current character is not part of $chars.
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $length = strspn($this->doc, $chars, $start);

    $this->pos += $length;
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    return ($length === 0) ? '' : substr($this->doc, $start, $length);
}
2008
+
2009
/**
 * Copy substring from the current document position to the first occurrence
 * of any of the provided characters, and move the document position to that
 * occurrence.
 *
 * @param string $chars A string containing every character to stop at.
 * @return string Substring from the current document position to the first
 * occurrence of any of the provided characters.
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $length = strcspn($this->doc, $chars, $start);

    $this->pos += $length;
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    return substr($this->doc, $start, $length);
}
2025
+
2026
/**
 * Copy substring from the current document position to the first occurrence
 * of the provided string.
 *
 * When the string does not occur anymore, the rest of the document is
 * returned and the parser is moved to the end of the document.
 *
 * @param string $char The string to stop at.
 * @return string Substring from the current document position to the first
 * occurrence of the provided string.
 */
protected function copy_until_char($char)
{
    // Already at the end of the document.
    if ($this->char === null) {
        return '';
    }

    $start = $this->pos;
    $found = strpos($this->doc, $char, $start);

    // Not found: hand out everything that is left.
    if ($found === false) {
        $rest = substr($this->doc, $start, $this->size-$start);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    // Found right at the current position: nothing to copy, nothing to move.
    if ($found === $start) {
        return '';
    }

    $this->char = $this->doc[$found];
    $this->pos = $found;
    return substr($this->doc, $start, $found-$start);
}
2051
+
2052
/**
 * Remove noise from HTML content
 *
 * Every match of the pattern is replaced in the document by a placeholder
 * key ("___noise___" followed by a number padded to five characters); the
 * replaced text is stored under that key in {@see simple_html_dom::$noise}.
 *
 * @param string $pattern The regex pattern used for finding noise
 * @param bool $remove_tag True to remove the entire match. Default is false
 * to only remove the captured data.
 */
protected function remove_noise($pattern, $remove_tag=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);

    // 0 = entire match, 1 = submatch
    $group = $remove_tag ? 0 : 1;

    // Replace from the last match to the first so the captured offsets of
    // the matches still to be processed stay valid.
    for ($i = $count - 1; $i >= 0; --$i) {
        $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
        if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); }

        $captured = $matches[$i][$group][0];
        $offset = $matches[$i][$group][1];

        $this->noise[$key] = $captured;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($captured));
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
2084
+
2085
/**
 * Restore noise to HTML content
 *
 * Noise is restored from {@see simple_html_dom::$noise}. Restored content is
 * scanned again, because noise can contain placeholders of other noise.
 *
 * @todo A noise entry that contains its own placeholder key still causes an
 * endless loop here, which could be utilized by malicious input.
 *
 * @param string $text A subset of HTML containing noise
 * @return string The same content with noise restored
 */
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // Position from which the next placeholder is searched. It is moved past
    // text that must not be examined again.
    $offset = 0;

    while (($pos = strpos($text, '___noise___', $offset)) !== false) {
        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos+15) {
            // A key is the marker plus the five characters following it.
            $key = substr($text, $pos, 16);
            if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); }

            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
                // Continue at the same position to restore nested noise.
                $offset = $pos;
            } else {
                // The notice repeats the unknown key and therefore contains
                // the marker again. Searching has to continue BEHIND the
                // notice, otherwise the same key is found over and over and
                // the text grows without end.
                $notice = 'UNDEFINED NOISE FOR KEY: '.$key;
                $text = substr($text, 0, $pos).$notice.substr($text, $pos+16);
                $offset = $pos + strlen($notice);
            }
        } else {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11);
            $offset = $pos;
        }
    }

    return $text;
}
2124
+
2125
/**
 * Look up a noise element by content (sometimes one of them is needed).
 *
 * @param string $text Text to search for
 * @return string|null The first stored noise element containing $text;
 * nothing is returned when there is none.
 */
function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    foreach ($this->noise as $candidate) {
        if (strpos($candidate, $text) === false) {
            continue;
        }
        return $candidate;
    }
}
2139
/**
 * String representation of the document: the inner text of the root node.
 *
 * @return string
 */
function __toString()
{
    $root = $this->root;
    return $root->innertext();
}
2143
+
2144
/**
 * Magic read access to virtual properties of the document.
 *
 * Supported names: "outertext" and "innertext" (both the inner text of the
 * root node), "plaintext" (text of the root node), "charset" and
 * "target_charset". Nothing is returned for any other name.
 *
 * @param string $name Property name
 * @return mixed
 */
function __get($name)
{
    switch ($name) {
        case 'outertext':
        case 'innertext':
            return $this->root->innertext();

        case 'plaintext':
            return $this->root->text();

        case 'charset':
            return $this->_charset;

        case 'target_charset':
            return $this->_target_charset;
    }
}
2160
+
2161
// camelCase aliases (W3C DOM style names); each one forwards to the root
// node, to find() or to str_get_html().

/** Forwards to the root node's childNodes(). */
function childNodes($idx=-1)
{
    return $this->root->childNodes($idx);
}

/** Forwards to the root node's first_child(). */
function firstChild()
{
    return $this->root->first_child();
}

/** Forwards to the root node's last_child(). */
function lastChild()
{
    return $this->root->last_child();
}

/** Parses "<$name>$value</$name>" and returns the first child of the result. */
function createElement($name, $value=null)
{
    return @str_get_html("<$name>$value</$name>")->first_child();
}

/** Parses $value and returns the last node of the result. */
function createTextNode($value)
{
    return @end(str_get_html($value)->nodes);
}

/** First element matching the id selector "#$id". */
function getElementById($id)
{
    return $this->find("#$id", 0);
}

/** Elements matching the id selector "#$id"; $idx is forwarded to find(). */
function getElementsById($id, $idx=null)
{
    return $this->find("#$id", $idx);
}

/** First element matching the tag name selector. */
function getElementByTagName($name)
{
    return $this->find($name, 0);
}

/** Elements matching the tag name selector; $idx is forwarded to find(). */
function getElementsByTagName($name, $idx=-1)
{
    return $this->find($name, $idx);
}
2171
/**
 * camelCase alias of load_file().
 *
 * The received arguments are spread onto load_file() one by one. Passing the
 * argument array itself would hand an array to file_get_contents() as the
 * file name.
 *
 * @return false|null Whatever load_file() returns
 */
function loadFile()
{
    $args = func_get_args();
    return call_user_func_array(array($this, 'load_file'), $args);
}
2172
+ }
2173
+
2174
+ ?>