Skip to content

Commit 1ac6609

Browse files
authored
Merge pull request #140 from gRegorLove/parsing-issue11
Improve recursive parsing
2 parents 351ea23 + d65fbfd commit 1ac6609

File tree

4 files changed

+361
-80
lines changed

4 files changed

+361
-80
lines changed

Mf2/Parser.php

+163-80
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,9 @@ class Parser {
302302
/** @var SplObjectStorage */
303303
protected $parsed;
304304

305+
/**
306+
* @var bool
307+
*/
305308
public $jsonMode;
306309

307310
/** @var boolean Whether to include experimental language parsing in the result */
@@ -316,6 +319,11 @@ class Parser {
316319
*/
317320
protected $upgraded;
318321

322+
/**
323+
* Whether to convert classic microformats
324+
* @var bool
325+
*/
326+
public $convertClassic;
319327

320328
/**
321329
* Constructor
@@ -931,74 +939,23 @@ public function parseH(\DOMElement $e, $is_backcompat = false) {
931939
$return = array();
932940
$children = array();
933941
$dates = array();
942+
$prefixes = array();
934943
$impliedTimezone = null;
935944

936-
// each rel-bookmark with an href attribute
937-
foreach ( $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," bookmark ") and @href]', $e) as $el )
938-
{
939-
$class = 'u-url';
940-
// rel-bookmark already has class attribute; append current value
941-
if ($el->hasAttribute('class')) {
942-
$class .= ' ' . $el->getAttribute('class');
943-
}
944-
$el->setAttribute('class', $class);
945-
}
946-
947-
$subMFs = $this->getRootMF($e);
948-
949-
// Handle nested microformats (h-*)
950-
foreach ( $subMFs as $subMF ) {
951-
952-
// Parse
953-
$result = $this->parseH($subMF);
954-
955-
// If result was already parsed, skip it
956-
if (null === $result) {
957-
continue;
958-
}
959-
960-
// Does this µf have any property names other than h-*?
961-
$properties = nestedMfPropertyNamesFromElement($subMF);
962-
963-
if (!empty($properties)) {
964-
// Yes! It’s a nested property µf
965-
foreach ($properties as $property => $prefixes) {
966-
// Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec.
967-
$prefixSpecificResult = $result;
968-
if (in_array('p-', $prefixes)) {
969-
$prefixSpecificResult['value'] = $prefixSpecificResult['properties']['name'][0];
970-
} elseif (in_array('e-', $prefixes)) {
971-
$eParsedResult = $this->parseE($subMF);
972-
$prefixSpecificResult['html'] = $eParsedResult['html'];
973-
$prefixSpecificResult['value'] = $eParsedResult['value'];
974-
} elseif (in_array('u-', $prefixes)) {
975-
$prefixSpecificResult['value'] = (empty($result['properties']['url'])) ? $this->parseU($subMF) : reset($result['properties']['url']);
976-
}
977-
$return[$property][] = $prefixSpecificResult;
978-
}
979-
} else {
980-
// No, it’s a child µf
981-
$children[] = $result;
982-
}
983-
984-
// Make sure this sub-mf won’t get parsed as a µf or property
985-
// TODO: Determine if clearing this is required?
986-
$this->elementPrefixParsed($subMF, 'h');
987-
$this->elementPrefixParsed($subMF, 'p');
988-
$this->elementPrefixParsed($subMF, 'u');
989-
$this->elementPrefixParsed($subMF, 'dt');
990-
$this->elementPrefixParsed($subMF, 'e');
991-
}
992-
993945
if($e->tagName == 'area') {
994946
$coords = $e->getAttribute('coords');
995947
$shape = $e->getAttribute('shape');
996948
}
997949

998950
// Handle p-*
999951
foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) {
952+
// element is already parsed
1000953
if ($this->isElementParsed($p, 'p')) {
1001954
continue;
955+
// backcompat parsing and element was not upgraded; skip it
956+
} else if ( $is_backcompat && empty($this->upgraded[$p]) ) {
957+
$this->elementPrefixParsed($p, 'p');
958+
continue;
1002959
}
1003960

1004961
$pValue = $this->parseP($p);
@@ -1016,8 +973,13 @@ public function parseH(\DOMElement $e, $is_backcompat = false) {
1016973

1017974
// Handle u-*
1018975
foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) {
976+
// element is already parsed
1019977
if ($this->isElementParsed($u, 'u')) {
1020978
continue;
979+
// backcompat parsing and element was not upgraded; skip it
980+
} else if ( $is_backcompat && empty($this->upgraded[$u]) ) {
981+
$this->elementPrefixParsed($u, 'u');
982+
continue;
1021983
}
1022984

1023985
$uValue = $this->parseU($u);
@@ -1035,8 +997,13 @@ public function parseH(\DOMElement $e, $is_backcompat = false) {
1035997

1036998
// Handle dt-*
1037999
foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
1000+
// element is already parsed
10381001
if ($this->isElementParsed($dt, 'dt')) {
10391002
continue;
1003+
// backcompat parsing and element was not upgraded; skip it
1004+
} else if ( $is_backcompat && empty($this->upgraded[$dt]) ) {
1005+
$this->elementPrefixParsed($dt, 'dt');
1006+
continue;
10401007
}
10411008

10421009
$dtValue = $this->parseDT($dt, $dates, $impliedTimezone);
@@ -1064,8 +1031,13 @@ public function parseH(\DOMElement $e, $is_backcompat = false) {
10641031

10651032
// Handle e-*
10661033
foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) {
1034+
// element is already parsed
10671035
if ($this->isElementParsed($em, 'e')) {
10681036
continue;
1037+
// backcompat parsing and element was not upgraded; skip it
1038+
} else if ( $is_backcompat && empty($this->upgraded[$em]) ) {
1039+
$this->elementPrefixParsed($em, 'e');
1040+
continue;
10691041
}
10701042

10711043
$eValue = $this->parseE($em);
@@ -1333,32 +1305,16 @@ public function parseRelsAndAlternates() {
13331305
return array($rels, $rel_urls, $alternates);
13341306
}
13351307

1308+
13361309
/**
13371310
* Kicks off the parsing routine
1338-
*
1339-
* If `$htmlSafe` is set, any angle brackets in the results from non e-* properties
1340-
* will be HTML-encoded, bringing all output to the same level of encoding.
1341-
*
1342-
* If a DOMElement is set as the $context, only descendants of that element will
1343-
* be parsed for microformats.
1344-
*
1345-
* @param bool $htmlSafe whether or not to html-encode non e-* properties. Defaults to false
1346-
* @param DOMElement $context optionally an element from which to parse microformats
1347-
* @return array An array containing all the µfs found in the current document
1311+
* @param bool $convertClassic whether to do backcompat parsing on microformats1. Defaults to true.
1312+
* @param DOMElement $context optionally specify an element from which to parse microformats
1313+
* @return array An array containing all the microformats found in the current document
13481314
*/
13491315
public function parse($convertClassic = true, DOMElement $context = null) {
1350-
$mfs = array();
1351-
$mfElements = $this->getRootMF($context);
1352-
1353-
foreach ($mfElements as $node) {
1354-
$is_backcompat = !$this->hasRootMf2($node);
1355-
1356-
if ( $convertClassic && $is_backcompat ) {
1357-
$this->backcompat($node);
1358-
}
1359-
1360-
$mfs[] = $this->parseH($node, $is_backcompat);
1361-
}
1316+
$this->convertClassic = $convertClassic;
1317+
$mfs = $this->parse_recursive($context);
13621318

13631319
// Parse rels
13641320
list($rels, $rel_urls, $alternates) = $this->parseRelsAndAlternates();
@@ -1376,6 +1332,122 @@ public function parse($convertClassic = true, DOMElement $context = null) {
13761332
return $top;
13771333
}
13781334

1335+
1336+
/**
 * Parse microformats recursively.
 *
 * Finds the root microformat elements below $context via getRootMF(), parses
 * each of them with parseH(), and recurses into each root to collect its
 * nested microformats. Keeps track of whether each root is a backcompat
 * (classic microformats) root, and upgrades it via backcompat() first when
 * $this->convertClassic is set.
 *
 * @param DOMElement $context node to start with (null is passed through to getRootMF())
 * @param int $depth recursion depth; 0 for the outermost call
 * @return array for a nested call ($depth > 0) in which at least one root was a
 *               property of its parent: an array keyed by property name;
 *               otherwise: a numerically indexed list of parsed microformats
 */
public function parse_recursive(DOMElement $context = null, $depth = 0) {
	$mfs = array();
	$properties = array();
	$mfElements = $this->getRootMF($context);

	foreach ($mfElements as $node) {
		$merge_properties = array();
		$children = array();

		$is_backcompat = !$this->hasRootMf2($node);

		if ( $this->convertClassic && $is_backcompat ) {
			$this->backcompat($node);
		}

		// Recurse one level deeper. $depth itself must NOT be incremented here:
		// doing so once per sibling made the depth grow with the number of roots,
		// so a top-level call with several roots was mistaken for a nested call.
		$recurse = $this->parse_recursive($node, $depth + 1);

		// recursion returned parsed result
		if ( !empty($recurse) ) {

			// numeric keys: the result is a list of nested (child) microformats
			if ( is_numeric(key($recurse)) ) {
				$children = $recurse;

			// string keys: the result is a set of microformat properties for this node
			} else {
				$merge_properties = $recurse;
			}

		}

		// parse for root mf
		$result = $this->parseH($node, $is_backcompat);

		// merge nested mf properties
		if ( $merge_properties && isset($result['properties']) ) {
			$result['properties'] = array_merge($result['properties'], $merge_properties);
		}

		// parseH returned a parsed result
		if ( $result ) {

			// check if node is also an mf property of its parent
			$temp_properties = nestedMfPropertyNamesFromElement($node);

			// properties found; set up parsed result in $properties
			if ( !empty($temp_properties) ) {

				foreach ($temp_properties as $property => $prefixes) {
					// Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec.
					$prefixSpecificResult = $result;
					if (in_array('p-', $prefixes)) {
						$prefixSpecificResult['value'] = $prefixSpecificResult['properties']['name'][0];
					} elseif (in_array('e-', $prefixes)) {
						$eParsedResult = $this->parseE($node);
						$prefixSpecificResult['html'] = $eParsedResult['html'];
						$prefixSpecificResult['value'] = $eParsedResult['value'];
					} elseif (in_array('u-', $prefixes)) {
						$prefixSpecificResult['value'] = (empty($result['properties']['url'])) ? $this->parseU($node) : reset($result['properties']['url']);
					}

					if ( $children ) {
						$prefixSpecificResult['children'] = $children;
					}

					$properties[$property][] = $prefixSpecificResult;
				}

			}

			// Mark this node as handled for every prefix
			// TODO: Determine if clearing this is required?
			$this->elementPrefixParsed($node, 'h');
			$this->elementPrefixParsed($node, 'p');
			$this->elementPrefixParsed($node, 'u');
			$this->elementPrefixParsed($node, 'dt');
			$this->elementPrefixParsed($node, 'e');

			// add children mf from recursion
			if ( $children ) {
				$result['children'] = $children;
			}

			$mfs[] = $result;
		}

	}

	// nested call whose node(s) are mf properties of the parent: return $properties,
	// which has property name(s) as array indices. The outermost call ($depth === 0)
	// always falls through and returns the list of microformats.
	if ( $properties && ($depth > 0) ) {
		return $properties;
	}

	// otherwise, return $mfs which has numeric array indices
	return $mfs;
}
1449+
1450+
13791451
/**
13801452
* Parse From ID
13811453
*
@@ -1413,7 +1485,7 @@ public function getRootMF(DOMElement $context = null) {
14131485

14141486
// add mf1 root class names
14151487
foreach ( $this->classicRootMap as $old => $new ) {
1416-
$xpaths[] = '( contains(concat(" ",normalize-space(@class), " "), " ' . $old . ' ") and not(ancestor::*[contains(concat(" ",normalize-space(@class)), " h-")]) )';
1488+
$xpaths[] = '( contains(concat(" ",normalize-space(@class), " "), " ' . $old . ' ") )';
14171489
}
14181490

14191491
// final xpath with OR
@@ -1448,6 +1520,17 @@ public function backcompat(DOMElement $el, $context = '', $isParentMf2 = false)
14481520
// special handling for specific properties
14491521
switch ( $classname )
14501522
{
1523+
case 'hentry':
1524+
$rel_bookmark = $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," bookmark ") and @href]', $el);
1525+
1526+
if ( $rel_bookmark->length ) {
1527+
foreach ( $rel_bookmark as $tempEl ) {
1528+
$this->addMfClasses($tempEl, 'u-url');
1529+
$this->addUpgraded($tempEl, array('bookmark'));
1530+
}
1531+
}
1532+
break;
1533+
14511534
case 'hreview':
14521535
$item_and_vcard = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " item ") and contains(concat(" ", normalize-space(@class), " "), " vcard ")]', $el);
14531536

0 commit comments

Comments
 (0)