Skip to content

Implemented @glennjones "innerText" parsing for better parsed whitespace #82

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 29, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 86 additions & 5 deletions Mf2/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,85 @@ public function textContent(DOMElement $el) {
return $clonedEl->textContent;
}

/**
 * This method attempts to return a better 'innerText' representation than DOMNode::textContent
 *
 * The node is walked recursively. Text nodes contribute their text, excluded
 * elements (scripts, styles, frames) contribute nothing, img/area/abbr elements
 * are replaced by their alt/title attribute when it is non-empty, block level
 * elements get a trailing space and br elements become a newline.
 *
 * @param DOMElement|DOMText $el the node to extract text from
 * @param bool $implied when parsing for implied name for h-*, rules may be slightly different:
 *                      an img without alt text does not fall back to its resolved src URL
 * @return string|null the extracted text; an empty string when $el itself is an excluded
 *                     element, or null when no text could be extracted at all
 * @see https://github.com/glennjones/microformat-shiv/blob/dev/lib/text.js
 */
public function innerText($el, $implied = false) {
    // Elements whose content is followed by a single space so adjacent blocks do not run together.
    $blockLevelTags = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'pre', 'table',
        'address', 'article', 'aside', 'blockquote', 'caption', 'col', 'colgroup', 'dd', 'div',
        'dt', 'dir', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'header', 'hgroup',
        'li', 'map', 'menu', 'nav', 'optgroup', 'option', 'section', 'tbody', 'textarea',
        'tfoot', 'th', 'thead', 'tr', 'td', 'ul', 'ol', 'dl', 'details');

    // Elements whose content never counts as text. 'noframes' is the real element name;
    // the historical 'noframe' and 'frames' entries are kept for backwards compatibility.
    $excludeTags = array('noframe', 'noframes', 'noscript', 'script', 'style', 'frames', 'frameset');

    // Normalise the tag name once so every comparison below is case-insensitive.
    // Nodes without a tagName (e.g. text nodes) yield null here.
    $tagName = isset($el->tagName) ? strtolower($el->tagName) : null;

    if ($tagName !== null) {
        if (in_array($tagName, $excludeTags, true)) {
            return '';
        }

        if ($tagName === 'img') {
            if ($el->getAttribute('alt') !== '') {
                return $el->getAttribute('alt');
            }

            // The src fallback is skipped for implied names.
            if (!$implied && $el->getAttribute('src') !== '') {
                return $this->resolveUrl($el->getAttribute('src'));
            }
        } elseif ($tagName === 'area' && $el->getAttribute('alt') !== '') {
            return $el->getAttribute('alt');
        } elseif ($tagName === 'abbr' && $el->getAttribute('title') !== '') {
            return $el->getAttribute('title');
        }
    }

    $out = '';

    // if node is a text node get its text
    if (isset($el->nodeType) && $el->nodeType === XML_TEXT_NODE) {
        $out .= $el->textContent;
    }

    // get the text of the child nodes
    if ($el->childNodes) {
        foreach ($el->childNodes as $child) {
            $text = $this->innerText($child, $implied);

            // null means the child produced no text at all
            if ($text !== null) {
                $out .= $text;
            }
        }
    }

    if ($tagName !== null) {
        if (in_array($tagName, $blockLevelTags, true)) {
            // if its a block level tag add an additional space at the end
            $out .= ' ';
        } elseif ($tagName === 'br') {
            // else if its a br, replace with newline
            $out .= "\n";
        }
    }

    return ($out === '') ? null : $out;
}

// TODO: figure out if this has problems with sms: and geo: URLs
public function resolveUrl($url) {
// If the URL is seriously malformed it’s probably beyond the scope of this
Expand Down Expand Up @@ -413,7 +492,7 @@ public function parseValueClassTitle(\DOMElement $e, $separator = '') {
}

/**
* Given an element with class="p-*", get it’s value
* Given an element with class="p-*", get its value
*
* @param DOMElement $p The element to parse
* @return string The plaintext value of $p, dependent on type
Expand All @@ -422,9 +501,12 @@ public function parseValueClassTitle(\DOMElement $e, $separator = '') {
public function parseP(\DOMElement $p) {
$classTitle = $this->parseValueClassTitle($p, ' ');

if ($classTitle !== null)
if ($classTitle !== null) {
return $classTitle;
}

$this->resolveChildUrls($p);

if ($p->tagName == 'img' and $p->getAttribute('alt') !== '') {
$pValue = $p->getAttribute('alt');
} elseif ($p->tagName == 'area' and $p->getAttribute('alt') !== '') {
Expand All @@ -434,7 +516,7 @@ public function parseP(\DOMElement $p) {
} elseif (in_array($p->tagName, array('data', 'input')) and $p->getAttribute('value') !== '') {
$pValue = $p->getAttribute('value');
} else {
$pValue = unicodeTrim($this->textContent($p));
$pValue = unicodeTrim($this->innerText($p));
}

return $pValue;
Expand Down Expand Up @@ -809,7 +891,6 @@ public function parseH(\DOMElement $e) {
}
}


// Look for double nested img @alt
foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
$emNames = mfNamesFromElement($em, 'h-');
Expand All @@ -826,7 +907,7 @@ public function parseH(\DOMElement $e) {
}
}

throw new Exception($e->nodeValue);
throw new Exception($this->innerText($e, true));
} catch (Exception $exc) {
$return['name'][] = unicodeTrim($exc->getMessage());
}
Expand Down
13 changes: 13 additions & 0 deletions tests/Mf2/ParsePTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,17 @@ public function testConvertsNestedImgElementToAltOrSrc() {
$this->assertEquals('Blah blah http://waterpigs.co.uk/photos/five-legged-elephant.jpg', $result['items'][0]['properties']['summary'][0]);
}

/**
 * A br element between two nested properties is expected to show up as a
 * newline in the parent p-adr value and in the h-card name, while the
 * nested street-address and locality values stay unaffected.
 *
 * @see https://github.com/indieweb/php-mf2/issues/69
 */
public function testBrWhitespaceIssue69() {
    $html = '<div class="h-card"><p class="p-adr"><span class="p-street-address">Street Name 9</span><br/><span class="p-locality">12345 NY, USA</span></p></div>';

    $parsed = Mf2\parse($html);
    $properties = $parsed['items'][0]['properties'];

    // Street and locality joined by the newline that stands in for the br element.
    $twoLineAddress = 'Street Name 9' . "\n" . '12345 NY, USA';

    $this->assertEquals($twoLineAddress, $properties['adr'][0]);
    $this->assertEquals('Street Name 9', $properties['street-address'][0]);
    $this->assertEquals('12345 NY, USA', $properties['locality'][0]);
    $this->assertEquals($twoLineAddress, $properties['name'][0]);
}

}
2 changes: 1 addition & 1 deletion tests/Mf2/ParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ public function testAreaTag() {
$parser = new Parser($input);
$output = $parser->parse();

$this->assertEquals('', $output['items'][0]['properties']['name'][0]);
$this->assertEquals('Person Bee', $output['items'][0]['properties']['name'][0]);
$this->assertEquals('rect', $output['items'][0]['properties']['category'][0]['shape']);
$this->assertEquals('100,100,120,120', $output['items'][0]['properties']['category'][0]['coords']);
$this->assertEquals('Person Bee', $output['items'][0]['properties']['category'][0]['value']);
Expand Down