Skip to content

Commit 5d1425e

Browse files
committed
Merge pull request #82 from gRegorLove/issue69
Implemented @glennjones "innerText" parsing for better parsed whitespace
2 parents 66266ea + fceeea3 commit 5d1425e

File tree

3 files changed

+100
-6
lines changed

3 files changed

+100
-6
lines changed

Mf2/Parser.php

+86-5
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,85 @@ public function textContent(DOMElement $el) {
354354
return $clonedEl->textContent;
355355
}
356356

357+
/**
 * Build a whitespace-aware plain-text ("innerText") rendering of a DOM node.
 *
 * Unlike DOMNode::textContent, this walks the subtree and:
 *  - returns an empty string for elements whose content is never shown as
 *    text (script, style, noscript, frame containers);
 *  - replaces img/area with their @alt (or, for img, the resolved @src) and
 *    abbr with its @title, when those attributes are non-empty;
 *  - appends a single space after block-level elements and a newline for br.
 *
 * Tag names are compared case-insensitively throughout.
 *
 * @param DOMElement|DOMText $el The node to serialise
 * @param bool $implied True when parsing the implied name of an h-* element;
 *                      an img without @alt then contributes no text instead
 *                      of falling back to its @src
 * @return string|null The collected text, or null when the node yields none
 * @see https://github.com/glennjones/microformat-shiv/blob/dev/lib/text.js
 */
public function innerText($el, $implied = false) {
    $blockLevelTags = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'pre', 'table',
        'address', 'article', 'aside', 'blockquote', 'caption', 'col', 'colgroup', 'dd', 'div',
        'dt', 'dir', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'header', 'hgroup',
        'li', 'map', 'menu', 'nav', 'optgroup', 'option', 'section', 'tbody', 'textarea',
        'tfoot', 'th', 'thead', 'tr', 'td', 'ul', 'ol', 'dl', 'details');

    // 'noframe' and 'frames' are kept from the ported list for backward
    // compatibility; 'noframes' is the actual HTML element name.
    $excludeTags = array('noframe', 'noframes', 'noscript', 'script', 'style', 'frames', 'frameset');

    // Text nodes expose no tagName; normalise the case once for elements so
    // every comparison below agrees.
    $tagName = isset($el->tagName) ? strtolower($el->tagName) : null;

    if ($tagName !== null) {
        if (in_array($tagName, $excludeTags, true)) {
            return '';
        }

        if ($tagName === 'img') {
            $alt = $el->getAttribute('alt');
            if ($alt !== '') {
                return $alt;
            }

            $src = $el->getAttribute('src');
            if (!$implied && $src !== '') {
                return $this->resolveUrl($src);
            }
        } elseif ($tagName === 'area' && $el->getAttribute('alt') !== '') {
            return $el->getAttribute('alt');
        } elseif ($tagName === 'abbr' && $el->getAttribute('title') !== '') {
            return $el->getAttribute('title');
        }
    }

    $out = '';

    // A text node contributes its own content.
    if (isset($el->nodeType) && $el->nodeType === XML_TEXT_NODE) {
        $out .= $el->textContent;
    }

    // Append the text of every child, in document order.
    if ($el->childNodes && $el->childNodes->length > 0) {
        foreach ($el->childNodes as $child) {
            $text = $this->innerText($child, $implied);

            if ($text !== null) {
                $out .= $text;
            }
        }
    }

    if ($tagName !== null) {
        if (in_array($tagName, $blockLevelTags, true)) {
            // Block-level elements are separated from what follows by a space.
            $out .= ' ';
        } elseif ($tagName === 'br') {
            // A line break becomes a literal newline.
            $out .= "\n";
        }
    }

    return ($out === '') ? null : $out;
}
435+
357436
// TODO: figure out if this has problems with sms: and geo: URLs
358437
public function resolveUrl($url) {
359438
// If the URL is seriously malformed it’s probably beyond the scope of this
@@ -413,7 +492,7 @@ public function parseValueClassTitle(\DOMElement $e, $separator = '') {
413492
}
414493

415494
/**
416-
* Given an element with class="p-*", get it’s value
495+
* Given an element with class="p-*", get its value
417496
*
418497
* @param DOMElement $p The element to parse
419498
* @return string The plaintext value of $p, dependent on type
@@ -422,9 +501,12 @@ public function parseValueClassTitle(\DOMElement $e, $separator = '') {
422501
public function parseP(\DOMElement $p) {
423502
$classTitle = $this->parseValueClassTitle($p, ' ');
424503

425-
if ($classTitle !== null)
504+
if ($classTitle !== null) {
426505
return $classTitle;
506+
}
427507

508+
$this->resolveChildUrls($p);
509+
428510
if ($p->tagName == 'img' and $p->getAttribute('alt') !== '') {
429511
$pValue = $p->getAttribute('alt');
430512
} elseif ($p->tagName == 'area' and $p->getAttribute('alt') !== '') {
@@ -434,7 +516,7 @@ public function parseP(\DOMElement $p) {
434516
} elseif (in_array($p->tagName, array('data', 'input')) and $p->getAttribute('value') !== '') {
435517
$pValue = $p->getAttribute('value');
436518
} else {
437-
$pValue = unicodeTrim($this->textContent($p));
519+
$pValue = unicodeTrim($this->innerText($p));
438520
}
439521

440522
return $pValue;
@@ -809,7 +891,6 @@ public function parseH(\DOMElement $e) {
809891
}
810892
}
811893

812-
813894
// Look for double nested img @alt
814895
foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
815896
$emNames = mfNamesFromElement($em, 'h-');
@@ -826,7 +907,7 @@ public function parseH(\DOMElement $e) {
826907
}
827908
}
828909

829-
throw new Exception($e->nodeValue);
910+
throw new Exception($this->innerText($e, true));
830911
} catch (Exception $exc) {
831912
$return['name'][] = unicodeTrim($exc->getMessage());
832913
}

tests/Mf2/ParsePTest.php

+13
Original file line numberDiff line numberDiff line change
@@ -102,4 +102,17 @@ public function testConvertsNestedImgElementToAltOrSrc() {
102102
$this->assertEquals('Blah blah http://waterpigs.co.uk/photos/five-legged-elephant.jpg', $result['items'][0]['properties']['summary'][0]);
103103
}
104104

105+
/**
 * A <br> separating two nested properties must appear as a newline in the
 * parsed value of the enclosing p-* property and in the h-card's name, while
 * the nested properties themselves keep their own text unchanged.
 *
 * @see https://github.com/indieweb/php-mf2/issues/69
 */
public function testBrWhitespaceIssue69() {
    $html = '<div class="h-card"><p class="p-adr"><span class="p-street-address">Street Name 9</span><br/><span class="p-locality">12345 NY, USA</span></p></div>';

    $street = 'Street Name 9';
    $locality = '12345 NY, USA';
    $joined = $street . "\n" . $locality;

    $parsed = Mf2\parse($html);
    $properties = $parsed['items'][0]['properties'];

    $this->assertEquals($joined, $properties['adr'][0]);
    $this->assertEquals($street, $properties['street-address'][0]);
    $this->assertEquals($locality, $properties['locality'][0]);
    $this->assertEquals($joined, $properties['name'][0]);
}
117+
105118
}

tests/Mf2/ParserTest.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ public function testAreaTag() {
286286
$parser = new Parser($input);
287287
$output = $parser->parse();
288288

289-
$this->assertEquals('', $output['items'][0]['properties']['name'][0]);
289+
$this->assertEquals('Person Bee', $output['items'][0]['properties']['name'][0]);
290290
$this->assertEquals('rect', $output['items'][0]['properties']['category'][0]['shape']);
291291
$this->assertEquals('100,100,120,120', $output['items'][0]['properties']['category'][0]['coords']);
292292
$this->assertEquals('Person Bee', $output['items'][0]['properties']['category'][0]['value']);

0 commit comments

Comments
 (0)