HTML.php 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. <?php
  if (defined('PHPEXCEL_ROOT') === false) {
      /**
       * Library root, resolved as two directory levels above this file.
       *
       * @ignore
       */
      define('PHPEXCEL_ROOT', __DIR__ . '/../../');
      // Pull in the autoloader only when this file is the first to define the root constant.
      require(PHPEXCEL_ROOT . 'PHPExcel/Autoloader.php');
  }
  9. /**
  10. * PHPExcel_Reader_HTML
  11. *
  12. * Copyright (c) 2006 - 2015 PHPExcel
  13. *
  14. * This library is free software; you can redistribute it and/or
  15. * modify it under the terms of the GNU Lesser General Public
  16. * License as published by the Free Software Foundation; either
  17. * version 2.1 of the License, or (at your option) any later version.
  18. *
  19. * This library is distributed in the hope that it will be useful,
  20. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  21. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  22. * Lesser General Public License for more details.
  23. *
  24. * You should have received a copy of the GNU Lesser General Public
  25. * License along with this library; if not, write to the Free Software
  26. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  27. *
  28. * @category PHPExcel
  29. * @package PHPExcel_Reader
  30. * @copyright Copyright (c) 2006 - 2015 PHPExcel (http://www.codeplex.com/PHPExcel)
  31. * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt LGPL
  32. * @version ##VERSION##, ##DATE##
  33. */
  34. /** PHPExcel root directory */
  35. class PHPExcel_Reader_HTML extends PHPExcel_Reader_Abstract implements PHPExcel_Reader_IReader
  36. {
  /**
   * Input encoding name; defaults to 'ANSI'.
   *
   * Written by setInputEncoding() and returned by getInputEncoding().
   * NOTE(review): loadIntoExisting() passes a fixed 'UTF-8' source encoding to
   * mb_convert_encoding() and does not appear to consult this value - confirm.
   *
   * @var string
   */
  protected $inputEncoding = 'ANSI';
  /**
   * Sheet index to read into.
   *
   * loadIntoExisting() creates sheets until this index exists in the workbook
   * and then makes it the active sheet before writing any cells.
   *
   * @var int
   */
  protected $sheetIndex = 0;
  /**
   * Style arrays applied to cells produced by specific HTML tags.
   *
   * Keys are lower-case tag names; values are arrays passed unchanged to
   * applyFromArray() on the cell style by processDomElement().
   *
   * The 'hr' border colour is keyed with 'argb', matching the 'a' entry; a
   * bare positional value carries no colour key for the style to read.
   *
   * @var array
   */
  protected $formats = array(
      'h1' => array(
          'font' => array(
              'bold' => true,
              'size' => 24,
          ),
      ), // Bold, 24pt
      'h2' => array(
          'font' => array(
              'bold' => true,
              'size' => 18,
          ),
      ), // Bold, 18pt
      'h3' => array(
          'font' => array(
              'bold' => true,
              'size' => 13.5,
          ),
      ), // Bold, 13.5pt
      'h4' => array(
          'font' => array(
              'bold' => true,
              'size' => 12,
          ),
      ), // Bold, 12pt
      'h5' => array(
          'font' => array(
              'bold' => true,
              'size' => 10,
          ),
      ), // Bold, 10pt
      'h6' => array(
          'font' => array(
              'bold' => true,
              'size' => 7.5,
          ),
      ), // Bold, 7.5pt
      'a' => array(
          'font' => array(
              'underline' => true,
              'color' => array(
                  'argb' => PHPExcel_Style_Color::COLOR_BLUE,
              ),
          ),
      ), // Blue underlined
      'hr' => array(
          'borders' => array(
              'bottom' => array(
                  'style' => PHPExcel_Style_Border::BORDER_THIN,
                  'color' => array(
                      'argb' => PHPExcel_Style_Color::COLOR_BLACK,
                  ),
              ),
          ),
      ), // Bottom border
  );
  /**
   * Cell coordinates already covered by a rowspan merge.
   *
   * Keys are cell references (column letter followed by row number) and the
   * value is always true. processDomElement() fills this when a td/th carries
   * a rowspan attribute and uses it to skip columns occupied by such a merge.
   *
   * @var array
   */
  protected $rowspan = array();
  /**
   * Create a new PHPExcel_Reader_HTML
   *
   * Installs a PHPExcel_Reader_DefaultReadFilter instance as the reader's
   * read filter.
   */
  public function __construct()
  {
      $defaultFilter = new PHPExcel_Reader_DefaultReadFilter();
      $this->readFilter = $defaultFilter;
  }
  /**
   * Check whether the currently open file looks like HTML
   *
   * Reads the first 2048 bytes from $this->fileHandle and reports true when
   * that chunk contains a '<' and stripping tags from it changes its length.
   *
   * @return boolean
   */
  protected function isValidFormat()
  {
      // The leading 2048 bytes are taken as a sufficient sample of the file
      $sample = fread($this->fileHandle, 2048);

      if (strpos($sample, '<') === false) {
          return false;
      }

      return strlen(strip_tags($sample)) !== strlen($sample);
  }
  /**
   * Load an HTML file into a freshly created PHPExcel workbook
   *
   * @param string $pFilename
   * @return PHPExcel
   * @throws PHPExcel_Reader_Exception
   */
  public function load($pFilename)
  {
      // Delegate to loadIntoExisting() with a brand new workbook as the target
      return $this->loadIntoExisting($pFilename, new PHPExcel());
  }
  /**
   * Store the input encoding name
   *
   * @param string $pValue Input encoding (defaults to 'ANSI')
   * @return PHPExcel_Reader_HTML This reader, to allow call chaining
   */
  public function setInputEncoding($pValue = 'ANSI')
  {
      $this->inputEncoding = $pValue;

      return $this;
  }
  /**
   * Return the input encoding name currently stored on this reader
   *
   * @return string
   */
  public function getInputEncoding()
  {
      $encoding = $this->inputEncoding;

      return $encoding;
  }
  // Data Array used for testing only, should write to PHPExcel object on completion of tests
  // flushCell() mirrors every written value here as $dataArray[$row][$column].
  protected $dataArray = array();
  // Current table nesting depth: incremented by setTableStartColumn(), decremented by
  // releaseTableStartColumn(), and reset to 0 when processDomElement() reaches <body>.
  protected $tableLevel = 0;
  // Start column of each open table, indexed by nesting level; index 0 holds 'A'.
  protected $nestedColumn = array('A');
  /**
   * Enter a new table nesting level and record its start column
   *
   * An outermost table (entered while the nesting level is 0) always starts in
   * column 'A'; a nested table starts in the column supplied by the caller.
   *
   * @param string $column Column the caller is currently positioned on
   * @return string Start column recorded for the new nesting level
   */
  protected function setTableStartColumn($column)
  {
      $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

      $this->tableLevel += 1;
      $this->nestedColumn[$this->tableLevel] = $startColumn;

      return $startColumn;
  }
  /**
   * Return the start column recorded for the current table nesting level
   *
   * @return string
   */
  protected function getTableStartColumn()
  {
      $level = $this->tableLevel;

      return $this->nestedColumn[$level];
  }
  /**
   * Leave the current table nesting level
   *
   * Removes the last recorded start column, lowers the nesting level by one
   * and hands the removed column back to the caller.
   *
   * @return string|null Column removed from the stack (null if it was empty)
   */
  protected function releaseTableStartColumn()
  {
      $releasedColumn = array_pop($this->nestedColumn);
      --$this->tableLevel;

      return $releasedColumn;
  }
  /**
   * Write the pending cell content to the worksheet and clear the buffer
   *
   * String content is written only when it is non-empty after trimming; it is
   * also mirrored into $this->dataArray. Non-string content (a rich text run)
   * is not written to the sheet yet and is only recorded in $this->dataArray.
   * In every case $cellContent is reset to an empty string on return.
   *
   * @param mixed  $sheet       Worksheet object receiving the value
   * @param string $column      Column letter of the target cell
   * @param int    $row         Row number of the target cell
   * @param mixed  $cellContent Pending content; cleared by reference
   */
  protected function flushCell($sheet, $column, $row, &$cellContent)
  {
      if (!is_string($cellContent)) {
          // We have a Rich Text run
          // TODO
          $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
      } elseif (trim($cellContent) > '') {
          // Plain string with real content: store it in the worksheet cell
          $sheet->setCellValue($column . $row, $cellContent, true);
          $this->dataArray[$row][$column] = $cellContent;
      }

      $cellContent = '';
  }
  /**
   * Recursively walk the children of a DOM node and write them to the worksheet
   *
   * Text nodes are trimmed, have runs of whitespace collapsed to one space and
   * are appended to $cellContent. Element nodes are dispatched on their tag
   * name: block-level tags flush the buffer and advance the row, table tags
   * drive the row/column cursor (including rowspan/colspan merges), and any
   * unrecognised tag is simply descended into.
   *
   * $row, $column and $cellContent are shared by reference through the whole
   * recursion and act as the write cursor and the pending text buffer.
   *
   * @param DOMNode $element     Node whose child nodes are processed
   * @param mixed   $sheet       Worksheet object being populated
   * @param int     $row         Current row (updated by reference)
   * @param string  $column      Current column letter (updated by reference)
   * @param mixed   $cellContent Pending cell content (updated by reference)
   * @param mixed   $format      Not used by this method
   */
  protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent, $format = null)
  {
      foreach ($element->childNodes as $child) {
          if ($child instanceof DOMText) {
              $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
              if (is_string($cellContent)) {
                  // simply append the text if the cell content is a plain text string
                  $cellContent .= $domText;
              } else {
                  // but if we have a rich text run instead, we need to append it correctly
                  // TODO
              }
          } elseif ($child instanceof DOMElement) {
              // Collect the element's attributes as name => value
              $attributeArray = array();
              foreach ($child->attributes as $attribute) {
                  $attributeArray[$attribute->name] = $attribute->value;
              }

              switch ($child->nodeName) {
                  case 'meta':
                      foreach ($attributeArray as $attributeName => $attributeValue) {
                          switch ($attributeName) {
                              case 'content':
                                  // TODO
                                  // Extract character set, so we can convert to UTF-8 if required
                                  break;
                          }
                      }
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      break;
                  case 'title':
                      // The document title becomes the worksheet title, not cell content
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      $sheet->setTitle($cellContent);
                      $cellContent = '';
                      break;
                  case 'span':
                  case 'div':
                  case 'font':
                  case 'i':
                  case 'em':
                  case 'strong':
                  case 'b':
                      // Inline/styling containers: keep their text separated by spaces
                      if ($cellContent > '') {
                          $cellContent .= ' ';
                      }
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      if ($cellContent > '') {
                          $cellContent .= ' ';
                      }
                      break;
                  case 'hr':
                      $this->flushCell($sheet, $column, $row, $cellContent);
                      ++$row;
                      if (isset($this->formats[$child->nodeName])) {
                          $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                      } else {
                          $cellContent = '----------';
                          $this->flushCell($sheet, $column, $row, $cellContent);
                      }
                      ++$row;
                      // Add a break after a horizontal rule, simply by allowing the code to dropthru
                  case 'br':
                      if ($this->tableLevel > 0) {
                          // If we're inside a table, replace with a \n
                          $cellContent .= "\n";
                      } else {
                          // Otherwise flush our existing content and move the row cursor on
                          $this->flushCell($sheet, $column, $row, $cellContent);
                          ++$row;
                      }
                      break;
                  case 'a':
                      foreach ($attributeArray as $attributeName => $attributeValue) {
                          switch ($attributeName) {
                              case 'href':
                                  // Attach the link target and the hyperlink style to the current cell
                                  $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue);
                                  if (isset($this->formats[$child->nodeName])) {
                                      $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                                  }
                                  break;
                          }
                      }
                      $cellContent .= ' ';
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      break;
                  case 'h1':
                  case 'h2':
                  case 'h3':
                  case 'h4':
                  case 'h5':
                  case 'h6':
                  case 'ol':
                  case 'ul':
                  case 'p':
                      if ($this->tableLevel > 0) {
                          // If we're inside a table, replace with a \n
                          $cellContent .= "\n";
                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      } else {
                          // Outside a table each block gets its own row
                          if ($cellContent > '') {
                              $this->flushCell($sheet, $column, $row, $cellContent);
                              $row++;
                          }
                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                          $this->flushCell($sheet, $column, $row, $cellContent);
                          if (isset($this->formats[$child->nodeName])) {
                              $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                          }
                          $row++;
                          $column = 'A';
                      }
                      break;
                  case 'li':
                      if ($this->tableLevel > 0) {
                          // If we're inside a table, replace with a \n
                          $cellContent .= "\n";
                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      } else {
                          if ($cellContent > '') {
                              $this->flushCell($sheet, $column, $row, $cellContent);
                          }
                          ++$row;
                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                          $this->flushCell($sheet, $column, $row, $cellContent);
                          $column = 'A';
                      }
                      break;
                  case 'table':
                      $this->flushCell($sheet, $column, $row, $cellContent);
                      $column = $this->setTableStartColumn($column);
                      if ($this->tableLevel > 1) {
                          // A nested table starts on the row of the cell that contains it
                          --$row;
                      }
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      $column = $this->releaseTableStartColumn();
                      if ($this->tableLevel > 1) {
                          ++$column;
                      } else {
                          ++$row;
                      }
                      break;
                  case 'thead':
                  case 'tbody':
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      break;
                  case 'tr':
                      // Each table row restarts at the table's start column with an empty buffer
                      $column = $this->getTableStartColumn();
                      $cellContent = '';
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      ++$row;
                      break;
                  case 'th':
                  case 'td':
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      // Skip over columns occupied by a rowspan merge recorded earlier
                      while (isset($this->rowspan[$column . $row])) {
                          ++$column;
                      }
                      $this->flushCell($sheet, $column, $row, $cellContent);
                      // TODO: inline 'style' attributes are not applied to the cell yet
                      if (isset($attributeArray['rowspan']) && isset($attributeArray['colspan'])) {
                          //create merging rowspan and colspan
                          $columnTo = $column;
                          for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) {
                              ++$columnTo;
                          }
                          $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
                          foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) {
                              $this->rowspan[$value] = true;
                          }
                          $sheet->mergeCells($range);
                          $column = $columnTo;
                      } elseif (isset($attributeArray['rowspan'])) {
                          //create merging rowspan
                          $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
                          foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) {
                              $this->rowspan[$value] = true;
                          }
                          $sheet->mergeCells($range);
                      } elseif (isset($attributeArray['colspan'])) {
                          //create merging colspan
                          $columnTo = $column;
                          for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) {
                              ++$columnTo;
                          }
                          $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
                          $column = $columnTo;
                      }
                      ++$column;
                      break;
                  case 'body':
                      $row = 1;
                      $column = 'A';
                      // Reset the shared buffer so that text gathered before <body>
                      // (e.g. from elements handled by the default branch) does not
                      // end up in the first cell of the sheet.
                      $cellContent = '';
                      $this->tableLevel = 0;
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      break;
                  default:
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
              }
          }
      }
  }
  /**
   * Load an HTML file into an existing PHPExcel workbook
   *
   * The file is first opened and sniffed with isValidFormat(); it is then
   * read again through securityScanFile(), parsed into a DOM document and
   * walked with processDomElement() to fill the sheet at $this->sheetIndex.
   *
   * @param string $pFilename
   * @param PHPExcel $objPHPExcel
   * @return PHPExcel
   * @throws PHPExcel_Reader_Exception
   */
  public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel)
  {
      // Validate the file, closing the handle whatever the outcome
      $this->openFile($pFilename);
      $looksLikeHtml = $this->isValidFormat();
      fclose($this->fileHandle);
      if (!$looksLikeHtml) {
          throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file.");
      }

      // Make sure the target sheet exists, then select it
      while ($this->sheetIndex >= $objPHPExcel->getSheetCount()) {
          $objPHPExcel->createSheet();
      }
      $objPHPExcel->setActiveSheetIndex($this->sheetIndex);

      // Parse the scanned file contents into a DOM document
      $dom = new DOMDocument();
      $scannedHtml = $this->securityScanFile($pFilename);
      $entityEncodedHtml = mb_convert_encoding($scannedHtml, 'HTML-ENTITIES', 'UTF-8');
      if ($dom->loadHTML($entityEncodedHtml) === false) {
          throw new PHPExcel_Reader_Exception('Failed to load ' . $pFilename . ' as a DOM Document');
      }

      // Discard white space
      // NOTE(review): this is assigned after loadHTML(); confirm it has any effect here.
      $dom->preserveWhiteSpace = false;

      // Walk the document, starting the cursor before the first row in column A
      $row = 0;
      $column = 'A';
      $content = '';
      $worksheet = $objPHPExcel->getActiveSheet();
      $this->processDomElement($dom, $worksheet, $row, $column, $content);

      return $objPHPExcel;
  }
  /**
   * Return the index of the sheet this reader loads into
   *
   * @return int
   */
  public function getSheetIndex()
  {
      $index = $this->sheetIndex;

      return $index;
  }
  /**
   * Choose the index of the sheet this reader loads into
   *
   * @param int $pValue Sheet index (defaults to 0)
   * @return PHPExcel_Reader_HTML This reader, to allow call chaining
   */
  public function setSheetIndex($pValue = 0)
  {
      $this->sheetIndex = $pValue;

      return $this;
  }
  /**
   * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks
   *
   * The pattern allows an optional NUL byte before, between and after the
   * characters of '<!ENTITY'.
   *
   * @param string $xml
   * @return string The input, unchanged, when no ENTITY declaration is found
   * @throws PHPExcel_Reader_Exception
   */
  public function securityScan($xml)
  {
      $markerCharacters = str_split('<!ENTITY');
      $pattern = '/\\0?' . implode('\\0?', $markerCharacters) . '\\0?/';

      if (!preg_match($pattern, $xml)) {
          return $xml;
      }

      throw new PHPExcel_Reader_Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
  }
  509. }