@@ -354,6 +354,85 @@ public function textContent(DOMElement $el) {
354
354
return $ clonedEl ->textContent ;
355
355
}
356
356
357
/**
 * Build an 'innerText'-style plain-text representation of a node, which is
 * closer to what a reader sees than DOMNode::textContent.
 *
 * The node's subtree is walked recursively and:
 *  - the content of non-rendered elements (script, style, ...) is dropped;
 *  - img/area elements are replaced by their non-empty @alt, and abbr
 *    elements by their non-empty @title;
 *  - an img without @alt is replaced by its resolved @src, unless $implied;
 *  - a single space is appended after block-level elements and a newline
 *    is emitted for br.
 *
 * @param DOMElement|DOMText $el the node to convert to text
 * @param bool $implied when parsing for implied name for h-*, rules may be slightly different
 *                      (an img without @alt then does not contribute its @src)
 * @return string|null the collected text; null when nothing was collected
 *                     (an excluded element yields an empty string instead)
 * @see https://github.com/glennjones/microformat-shiv/blob/dev/lib/text.js
 */
public function innerText($el, $implied = false) {
    $out = '';

    $blockLevelTags = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'pre', 'table',
        'address', 'article', 'aside', 'blockquote', 'caption', 'col', 'colgroup', 'dd', 'div',
        'dt', 'dir', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'header', 'hgroup',
        'li', 'map', 'menu', 'nav', 'optgroup', 'option', 'section', 'tbody', 'textarea',
        'tfoot', 'th', 'thead', 'tr', 'td', 'ul', 'ol', 'dl', 'details');

    // 'noframe' and 'frames' are kept for backward compatibility; the real
    // HTML element names are 'noframes' and 'frame'.
    $excludeTags = array('noframe', 'noframes', 'noscript', 'script', 'style', 'frame', 'frames', 'frameset');

    // Text nodes have no tagName. Normalise the case once so every
    // comparison below behaves the same way.
    $tagName = isset($el->tagName) ? strtolower($el->tagName) : null;

    if ($tagName !== null) {
        if (in_array($tagName, $excludeTags, true)) {
            return $out;
        }

        if ($tagName === 'img') {
            $alt = $el->getAttribute('alt');
            if ($alt !== '') {
                return $alt;
            }

            $src = $el->getAttribute('src');
            if (!$implied && $src !== '') {
                return $this->resolveUrl($src);
            }
        } elseif ($tagName === 'area' && $el->getAttribute('alt') !== '') {
            return $el->getAttribute('alt');
        } elseif ($tagName === 'abbr' && $el->getAttribute('title') !== '') {
            return $el->getAttribute('title');
        }
    }

    // if node is a text node get its text
    if (isset($el->nodeType) && $el->nodeType === XML_TEXT_NODE) {
        $out .= $el->textContent;
    }

    // get the text of the child nodes
    if ($el->childNodes) {
        foreach ($el->childNodes as $child) {
            $text = $this->innerText($child, $implied);

            // children that produced nothing return null
            if ($text !== null) {
                $out .= $text;
            }
        }
    }

    if ($tagName !== null) {
        if (in_array($tagName, $blockLevelTags, true)) {
            // if its a block level tag add an additional space at the end
            $out .= ' ';
        } elseif ($tagName === 'br') {
            // else if its a br, replace with newline
            $out .= "\n";
        }
    }

    return ($out === '') ? null : $out;
}
435
+
357
436
// TODO: figure out if this has problems with sms: and geo: URLs
358
437
public function resolveUrl ($ url ) {
359
438
// If the URL is seriously malformed it’s probably beyond the scope of this
@@ -413,7 +492,7 @@ public function parseValueClassTitle(\DOMElement $e, $separator = '') {
413
492
}
414
493
415
494
/**
416
- * Given an element with class="p-*", get it’s value
495
+ * Given an element with class="p-*", get its value
417
496
*
418
497
* @param DOMElement $p The element to parse
419
498
* @return string The plaintext value of $p, dependent on type
@@ -422,9 +501,12 @@ public function parseValueClassTitle(\DOMElement $e, $separator = '') {
422
501
public function parseP (\DOMElement $ p ) {
423
502
$ classTitle = $ this ->parseValueClassTitle ($ p , ' ' );
424
503
425
- if ($ classTitle !== null )
504
+ if ($ classTitle !== null ) {
426
505
return $ classTitle ;
506
+ }
427
507
508
+ $ this ->resolveChildUrls ($ p );
509
+
428
510
if ($ p ->tagName == 'img ' and $ p ->getAttribute ('alt ' ) !== '' ) {
429
511
$ pValue = $ p ->getAttribute ('alt ' );
430
512
} elseif ($ p ->tagName == 'area ' and $ p ->getAttribute ('alt ' ) !== '' ) {
@@ -434,7 +516,7 @@ public function parseP(\DOMElement $p) {
434
516
} elseif (in_array ($ p ->tagName , array ('data ' , 'input ' )) and $ p ->getAttribute ('value ' ) !== '' ) {
435
517
$ pValue = $ p ->getAttribute ('value ' );
436
518
} else {
437
- $ pValue = unicodeTrim ($ this ->textContent ($ p ));
519
+ $ pValue = unicodeTrim ($ this ->innerText ($ p ));
438
520
}
439
521
440
522
return $ pValue ;
@@ -809,7 +891,6 @@ public function parseH(\DOMElement $e) {
809
891
}
810
892
}
811
893
812
-
813
894
// Look for double nested img @alt
814
895
foreach ($ this ->xpath ->query ('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0] ' , $ e ) as $ em ) {
815
896
$ emNames = mfNamesFromElement ($ em , 'h- ' );
@@ -826,7 +907,7 @@ public function parseH(\DOMElement $e) {
826
907
}
827
908
}
828
909
829
- throw new Exception ($ e -> nodeValue );
910
+ throw new Exception ($ this -> innerText ( $ e , true ) );
830
911
} catch (Exception $ exc ) {
831
912
$ return ['name ' ][] = unicodeTrim ($ exc ->getMessage ());
832
913
}
0 commit comments