From 9566d7460009932dd51697f2e0d01db1597039d0 Mon Sep 17 00:00:00 2001 From: Gregor Morrill Date: Sun, 21 Feb 2016 22:47:03 -0800 Subject: [PATCH 1/3] Added failing test for #69 --- tests/Mf2/ParsePTest.php | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/Mf2/ParsePTest.php b/tests/Mf2/ParsePTest.php index c4959b3..3ad4b81 100644 --- a/tests/Mf2/ParsePTest.php +++ b/tests/Mf2/ParsePTest.php @@ -102,4 +102,17 @@ public function testConvertsNestedImgElementToAltOrSrc() { $this->assertEquals('Blah blah http://waterpigs.co.uk/photos/five-legged-elephant.jpg', $result['items'][0]['properties']['summary'][0]); } + /** + * @see https://github.com/indieweb/php-mf2/issues/69 + */ + public function testBrWhitespaceIssue69() { + $input = '

Street Name 9
12345 NY, USA

'; + $result = Mf2\parse($input); + + $this->assertEquals('Street Name 9' . "\n" . '12345 NY, USA', $result['items'][0]['properties']['adr'][0]); + $this->assertEquals('Street Name 9', $result['items'][0]['properties']['street-address'][0]); + $this->assertEquals('12345 NY, USA', $result['items'][0]['properties']['locality'][0]); + $this->assertEquals('Street Name 9' . "\n" . '12345 NY, USA', $result['items'][0]['properties']['name'][0]); + } + } From cc3037200d33455bac522c423fb14d81478c1313 Mon Sep 17 00:00:00 2001 From: Gregor Morrill Date: Sun, 21 Feb 2016 22:47:52 -0800 Subject: [PATCH 2/3] Implemented @glennjones "innerText" parsing for better handling of block level elements and whitespace. --- Mf2/Parser.php | 98 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 93 insertions(+), 5 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 5df142a..8b7a10b 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -354,6 +354,87 @@ public function textContent(DOMElement $el) { return $clonedEl->textContent; } + /** + * This method attempts to return a better 'innerText' representation than DOMNode::textContent + * + * @param DOMElement|DOMText $el + * @param bool $implied when parsing for implied name for h-*, rules may be slightly different + * @see: https://github.com/glennjones/microformat-shiv/blob/dev/lib/text.js + */ + public function innerText($el, $implied = false) { + $out = ''; + + $blockLevelTags = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'pre', 'table', + 'address', 'article', 'aside', 'blockquote', 'caption', 'col', 'colgroup', 'dd', 'div', + 'dt', 'dir', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'header', 'hgroup', 'hr', + 'li', 'map', 'menu', 'nav', 'optgroup', 'option', 'section', 'tbody', 'testarea', + 'tfoot', 'th', 'thead', 'tr', 'td', 'ul', 'ol', 'dl', 'details'); + + $excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset'); + + if ( isset($el->tagName) ) + { + + if (in_array(strtolower($el->tagName), $excludeTags)) { + return $out; + } + else if ($el->tagName == 'img') { + if ($el->getAttribute('alt') !== '') { + return $el->getAttribute('alt'); + } + else if (!$implied && $el->getAttribute('src') !== '') { + { + return $this->resolveUrl($el->getAttribute('src')); + } + } + } else if ($el->tagName == 'area' and $el->getAttribute('alt') !== '') { + return $el->getAttribute('alt'); + } else if ($el->tagName == 'abbr' and $el->getAttribute('title') !== '') { + return $el->getAttribute('title'); + // } else if (in_array($el->tagName, array('data', 'input')) and $el->getAttribute('value') !== '') { + // return $el->getAttribute('value'); + } + + } + + // if node is a text node get its text + if ( isset($el->nodeType) && $el->nodeType === 3) { + $out .= $el->textContent; + } + + // get the text of the child nodes + if ($el->childNodes && $el->childNodes->length > 0) { + + for ($j = 0; $j < $el->childNodes->length; $j++) { + + $text = $this->innerText($el->childNodes->item($j), $implied); + + if ( !is_null($text) ) + { + $out .= $text; + } + + } + } + + if ( isset($el->tagName) ) { + + // if its a block level tag add an additional space at the end + if ( in_array(strtolower($el->tagName), $blockLevelTags) ) + { + $out .= ' '; + } + // else if its a br, replace with newline + else if ( strtolower($el->tagName) == 'br') + { + $out .= "\n"; + } + + } + + return ( $out === '' ) ? NULL : $out; + } + // TODO: figure out if this has problems with sms: and geo: URLs public function resolveUrl($url) { // If the URL is seriously malformed it’s probably beyond the scope of this @@ -413,7 +494,7 @@ public function parseValueClassTitle(\DOMElement $e, $separator = '') { } /** - * Given an element with class="p-*", get it’s value + * Given an element with class="p-*", get its value * * @param DOMElement $p The element to parse * @return string The plaintext value of $p, dependant on type @@ -422,9 +503,15 @@ public function parseValueClassTitle(\DOMElement $e, $separator = '') { public function parseP(\DOMElement $p) { $classTitle = $this->parseValueClassTitle($p, ' '); - if ($classTitle !== null) + if ($classTitle !== null) { return $classTitle; + } + $this->resolveChildUrls($p); + + // $pValue = unicodeTrim($this->innerText($p)); + // return $pValue; + if ($p->tagName == 'img' and $p->getAttribute('alt') !== '') { $pValue = $p->getAttribute('alt'); } elseif ($p->tagName == 'area' and $p->getAttribute('alt') !== '') { @@ -434,10 +521,12 @@ public function parseP(\DOMElement $p) { } elseif (in_array($p->tagName, array('data', 'input')) and $p->getAttribute('value') !== '') { $pValue = $p->getAttribute('value'); } else { - $pValue = unicodeTrim($this->textContent($p)); + $pValue = unicodeTrim($this->innerText($p)); + // $pValue = unicodeTrim($this->textContent($p)); } return $pValue; + } /** @@ -809,7 +898,6 @@ public function parseH(\DOMElement $e) { } } - // Look for double nested img @alt foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) { $emNames = mfNamesFromElement($em, 'h-'); @@ -826,7 +914,7 @@ public function parseH(\DOMElement $e) { } } - throw new Exception($e->nodeValue); + throw new Exception($this->innerText($e, true)); } catch (Exception $exc) { $return['name'][] = unicodeTrim($exc->getMessage()); } From fceeea355f71703ad31aa56c8c748fe988ad4fc6 Mon Sep 17 00:00:00 2001 From: Gregor Morrill Date: Sun, 21 Feb 2016 22:56:25 -0800 Subject: [PATCH 3/3] Cleaned up my code. Updated test ParserTest::testAreaTag() with what I believe is the correct assertion. --- Mf2/Parser.php | 19 ++++++------------- tests/Mf2/ParserTest.php | 2 +- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 8b7a10b..b3b3b9b 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -372,7 +372,7 @@ public function innerText($el, $implied = false) { $excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset'); - if ( isset($el->tagName) ) + if (isset($el->tagName)) { if (in_array(strtolower($el->tagName), $excludeTags)) { @@ -391,14 +391,12 @@ public function innerText($el, $implied = false) { return $el->getAttribute('alt'); } else if ($el->tagName == 'abbr' and $el->getAttribute('title') !== '') { return $el->getAttribute('title'); - // } else if (in_array($el->tagName, array('data', 'input')) and $el->getAttribute('value') !== '') { - // return $el->getAttribute('value'); } } // if node is a text node get its text - if ( isset($el->nodeType) && $el->nodeType === 3) { + if (isset($el->nodeType) && $el->nodeType === 3) { $out .= $el->textContent; } @@ -417,22 +415,22 @@ public function innerText($el, $implied = false) { } } - if ( isset($el->tagName) ) { + if (isset($el->tagName)) { // if its a block level tag add an additional space at the end - if ( in_array(strtolower($el->tagName), $blockLevelTags) ) + if (in_array(strtolower($el->tagName), $blockLevelTags)) { $out .= ' '; } // else if its a br, replace with newline - else if ( strtolower($el->tagName) == 'br') + else if (strtolower($el->tagName) == 'br') { $out .= "\n"; } } - return ( $out === '' ) ? NULL : $out; + return ($out === '') ? NULL : $out; } // TODO: figure out if this has problems with sms: and geo: URLs @@ -508,9 +506,6 @@ public function parseP(\DOMElement $p) { } $this->resolveChildUrls($p); - - // $pValue = unicodeTrim($this->innerText($p)); - // return $pValue; if ($p->tagName == 'img' and $p->getAttribute('alt') !== '') { $pValue = $p->getAttribute('alt'); @@ -522,11 +517,9 @@ public function parseP(\DOMElement $p) { $pValue = $p->getAttribute('value'); } else { $pValue = unicodeTrim($this->innerText($p)); - // $pValue = unicodeTrim($this->textContent($p)); } return $pValue; - } /** diff --git a/tests/Mf2/ParserTest.php b/tests/Mf2/ParserTest.php index 54a229d..9e76d5d 100644 --- a/tests/Mf2/ParserTest.php +++ b/tests/Mf2/ParserTest.php @@ -286,7 +286,7 @@ public function testAreaTag() { $parser = new Parser($input); $output = $parser->parse(); - $this->assertEquals('', $output['items'][0]['properties']['name'][0]); + $this->assertEquals('Person Bee', $output['items'][0]['properties']['name'][0]); $this->assertEquals('rect', $output['items'][0]['properties']['category'][0]['shape']); $this->assertEquals('100,100,120,120', $output['items'][0]['properties']['category'][0]['coords']); $this->assertEquals('Person Bee', $output['items'][0]['properties']['category'][0]['value']);