source: github/program/lib/html2text.inc @ ab6f807

HEADcourier-fixdev-browser-capabilitiespdorelease-0.6release-0.7release-0.8
Last change on this file since ab6f807 was ab6f807, checked in by thomascube <thomas@…>, 6 years ago

Updated PEAR::Mail_mime package

  • Property mode set to 100644
File size: 14.3 KB
Line 
1<?php
2
3/*************************************************************************
4*                                                                       *
5* class.html2text.inc                                                   *
6*                                                                       *
7*************************************************************************
8*                                                                       *
9* Converts HTML to formatted plain text                                 *
10*                                                                       *
11* Copyright (c) 2005 Jon Abernathy <jon@chuggnutt.com>                  *
12* All rights reserved.                                                  *
13*                                                                       *
14* This script is free software; you can redistribute it and/or modify   *
15* it under the terms of the GNU General Public License as published by  *
16* the Free Software Foundation; either version 2 of the License, or     *
17* (at your option) any later version.                                   *
18*                                                                       *
19* The GNU General Public License can be found at                        *
20* http://www.gnu.org/copyleft/gpl.html.                                 *
21*                                                                       *
22* This script is distributed in the hope that it will be useful,        *
23* but WITHOUT ANY WARRANTY; without even the implied warranty of        *
24* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
25* GNU General Public License for more details.                          *
26*                                                                       *
27* Author(s): Jon Abernathy <jon@chuggnutt.com>                          *
28*                                                                       *
29* Last modified: 04/06/05                                               *
30* Modified: 2004/05/19 (tbr)                                            *
31*                                                                       *
32*************************************************************************/
33
34
/**
*  Takes HTML and converts it to formatted, plain text.
*
*  Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and
*  correcting an error in the regexp search array. Fixed 7/30/03.
*
*  Updated set_html() function's file reading mechanism, 9/25/03.
*
*  Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding
*  several more HTML entity codes to the $search and $replace arrays.
*  Updated 11/7/03.
*
*  Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for
*  suggesting the addition of $allowed_tags and its supporting function
*  (which I slightly modified). Updated 3/12/04.
*
*  Thanks to Justin Dearing for pointing out that a replacement for the
*  <TH> tag was missing, and suggesting an appropriate fix.
*  Updated 8/25/04.
*
*  Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a
*  display/formatting bug in the _build_link_list() function: email
*  readers would show the left bracket and number ("[1") as part of the
*  rendered email address.
*  Updated 12/16/04.
*
*  Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code
*  to handle relative links, which I hadn't considered. I modified his
*  code a bit to handle normal HTTP links and MAILTO links. Also for
*  suggesting three additional HTML entity codes to search for.
*  Updated 03/02/05.
*
*  Thanks to Jacob Chandler for pointing out another link condition
*  for the _build_link_list() function: "https".
*  Updated 04/06/05.
*
*  Computed replacements (links, headings, bold text, table headers) are
*  produced by callback methods through preg_replace_callback() instead of
*  the PCRE "e" (eval) modifier, so matched HTML is never executed as PHP
*  code and the class keeps working on PHP versions that dropped "e".
*
*  @author Jon Abernathy <jon@chuggnutt.com>
*  @version 0.6.1
*  @since PHP 4.3.0
*/
class html2text
{

    /**
     *  Contains the HTML content to convert.
     *
     *  @var string $html
     *  @access public
     */
    var $html;

    /**
     *  Contains the converted, formatted text.
     *
     *  @var string $text
     *  @access public
     */
    var $text;

    /**
     *  Maximum width of the formatted text, in columns.
     *
     *  @var integer $width
     *  @access public
     */
    var $width = 70;

    /**
     *  List of preg* regular expression patterns to search for,
     *  used in conjunction with $replace. Patterns are applied one
     *  after another, in the order listed here.
     *
     *  @var array $search
     *  @access public
     *  @see $replace
     */
    var $search = array(
        "/\r/",                                  // Non-legal carriage return
        "/[\n\t]+/",                             // Newlines and tabs
        '/<script[^>]*>.*?<\/script>/i',         // <script>s -- which strip_tags supposedly has problems with
        '/<a [^>]*href=("|\')([^"\']+)\1[^>]*>(.+?)<\/a>/i', // <a href="">
        '/<h[123][^>]*>(.+?)<\/h[123]>/i',       // H1 - H3
        '/<h[456][^>]*>(.+?)<\/h[456]>/i',       // H4 - H6
        '/<p[^>]*>/i',                           // <P>
        '/<br[^>]*>/i',                          // <br>
        '/<b\b[^>]*>(.+?)<\/b>/i',               // <b> (but not <body>, <blockquote>, ...)
        '/<i\b[^>]*>(.+?)<\/i>/i',               // <i> (but not <img>, <input>, ...)
        '/(<ul[^>]*>|<\/ul>)/i',                 // <ul> and </ul>
        '/(<ol[^>]*>|<\/ol>)/i',                 // <ol> and </ol>
        '/<li\b[^>]*>/i',                        // <li> (but not <link>)
        '/<hr[^>]*>/i',                          // <hr>
        '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
        '/(<tr[^>]*>|<\/tr>)/i',                 // <tr> and </tr>
        '/<td[^>]*>(.+?)<\/td>/i',               // <td> and </td>
        '/<th\b[^>]*>(.+?)<\/th>/i',             // <th> and </th> (but not <thead>)
        '/&nbsp;/i',
        '/&quot;/i',
        '/&gt;/i',
        '/&lt;/i',
        '/&(amp|#38);/i',
        '/&copy;/i',
        '/&trade;/i',
        '/&#8220;/',
        '/&#8221;/',
        '/&#8211;/',
        '/&#(8217|39);/',
        '/&#169;/',
        '/&#8482;/',
        '/&#151;/',
        '/&#147;/',
        '/&#148;/',
        '/&#149;/',
        '/&reg;/i',
        '/&bull;/i',
        '/&(#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]*);/i' // Any entity not handled above
    );

    /**
     *  List of pattern replacements corresponding to patterns searched.
     *
     *  An entry is either a preg_replace() replacement string, or a
     *  one-element array holding the name of a method of this class.
     *  Such a method receives the match array from
     *  preg_replace_callback() and returns the replacement text.
     *
     *  @var array $replace
     *  @access public
     *  @see $search
     */
    var $replace = array(
        '',                                     // Non-legal carriage return
        ' ',                                    // Newlines and tabs
        '',                                     // <script>s -- which strip_tags supposedly has problems with
        array('_replace_link'),                 // <a href="">
        array('_replace_heading_major'),        // H1 - H3
        array('_replace_heading_minor'),        // H4 - H6
        "\n\n",                                 // <P>
        "\n",                                   // <br>
        array('_replace_bold'),                 // <b>
        '_\\1_',                                // <i>
        "\n\n",                                 // <ul> and </ul>
        "\n\n",                                 // <ol> and </ol>
        "\t*",                                  // <li>
        "\n-------------------------\n",        // <hr>
        "\n\n",                                 // <table> and </table>
        "\n",                                   // <tr> and </tr>
        "\t\t\\1\n",                            // <td> and </td>
        array('_replace_table_header'),         // <th> and </th>
        ' ',
        '"',
        '>',
        '<',
        '&',
        '(c)',
        '(tm)',
        '"',
        '"',
        '-',
        "'",
        '(c)',
        '(tm)',
        '--',
        '"',
        '"',
        '*',
        '(R)',
        '*',
        ''
    );

    /**
     *  Contains a list of HTML tags to allow in the resulting text.
     *
     *  @var string $allowed_tags
     *  @access public
     *  @see set_allowed_tags()
     */
    var $allowed_tags = '';

    /**
     *  Contains the base URL that relative links should resolve to.
     *
     *  @var string $url
     *  @access public
     */
    var $url;

    /**
     *  Indicates whether content in the $html variable has been converted yet.
     *
     *  @var boolean $_converted
     *  @access private
     *  @see $html, $text
     */
    var $_converted = false;

    /**
     *  Contains URL addresses from links to be rendered in plain text.
     *  Position N in the list is shown as reference number N+1.
     *
     *  @var array $_link_list
     *  @access private
     *  @see _build_link_list()
     */
    var $_link_list = array();

    /**
     * Boolean flag, true if a table of link URLs should be listed after the text.
     *
     * @var boolean $_do_links
     * @access private
     * @see html2text()
     */
    var $_do_links = true;

    /**
     *  Constructor.
     *
     *  If the HTML source string (or file) is supplied, the class
     *  will instantiate with that source propagated, all that has
     *  to be done is to call get_text().
     *
     *  @param string $source HTML content
     *  @param boolean $from_file Indicates $source is a file to pull content from
     *  @param boolean $produce_link_table Indicates whether a table of link URLs is desired
     *  @access public
     *  @return void
     */
    function __construct( $source = '', $from_file = false, $produce_link_table = true )
    {
        if ( !empty($source) ) {
            $this->set_html($source, $from_file);
        }
        $this->set_base_url();
        $this->_do_links = $produce_link_table;
    }

    /**
     *  Old-style (class-named) constructor.
     *
     *  Kept so that PHP 4 and code calling html2text() explicitly keep
     *  working; it simply forwards to __construct(). Declared after
     *  __construct() on purpose so that PHP 5 does not report a
     *  redefined constructor.
     *
     *  @param string $source HTML content
     *  @param boolean $from_file Indicates $source is a file to pull content from
     *  @param boolean $produce_link_table Indicates whether a table of link URLs is desired
     *  @access public
     *  @return void
     *  @see __construct()
     */
    function html2text( $source = '', $from_file = false, $produce_link_table = true )
    {
        $this->__construct($source, $from_file, $produce_link_table);
    }

    /**
     *  Loads source HTML into memory, either from $source string or a file.
     *
     *  When $from_file is set but $source is not a readable regular file,
     *  $source itself is used as the HTML content. If the file exists but
     *  cannot be read, the content is set to an empty string.
     *
     *  @param string $source HTML content
     *  @param boolean $from_file Indicates $source is a file to pull content from
     *  @access public
     *  @return void
     */
    function set_html( $source, $from_file = false )
    {
        $this->html = $source;

        if ( $from_file && is_file($source) && is_readable($source) ) {
            // file_get_contents() copes with empty files and never
            // leaves a dangling file handle behind on failure.
            $contents = file_get_contents($source);
            $this->html = ($contents === false) ? '' : $contents;
        }

        $this->_converted = false;
    }

    /**
     *  Returns the text, converted from HTML.
     *
     *  The conversion runs only once per loaded document; later calls
     *  return the cached result.
     *
     *  @access public
     *  @return string
     */
    function get_text()
    {
        if ( !$this->_converted ) {
            $this->_convert();
        }

        return $this->text;
    }

    /**
     *  Prints the text, converted from HTML.
     *
     *  @access public
     *  @return void
     */
    function print_text()
    {
        print $this->get_text();
    }

    /**
     *  Alias to print_text(), operates identically.
     *
     *  @access public
     *  @return void
     *  @see print_text()
     */
    function p()
    {
        $this->print_text();
    }

    /**
     *  Sets the allowed HTML tags to pass through to the resulting text.
     *
     *  Tags should be in the form "<p>", with no corresponding closing tag.
     *  An empty value is ignored and leaves the current setting untouched.
     *
     *  @param string $allowed_tags Tags to keep, e.g. "<p><a>"
     *  @access public
     *  @return void
     */
    function set_allowed_tags( $allowed_tags = '' )
    {
        if ( !empty($allowed_tags) ) {
            $this->allowed_tags = $allowed_tags;
        }
    }

    /**
     *  Sets a base URL to handle relative links.
     *
     *  Without an argument the base is derived from the current HTTP
     *  host; when no host is known (e.g. command line use) the base is
     *  left empty, so relative links are listed as site-relative paths.
     *
     *  @param string $url Base URL, with or without trailing slash
     *  @access public
     *  @return void
     */
    function set_base_url( $url = '' )
    {
        if ( empty($url) ) {
            if ( !empty($_SERVER['HTTP_HOST']) ) {
                $this->url = 'http://' . $_SERVER['HTTP_HOST'];
            } else {
                $this->url = '';
            }
        } else {
            // Strip any trailing slashes for consistency (relative
            // URLs may already start with a slash like "/file.html")
            $this->url = rtrim($url, '/');
        }
    }

    /**
     *  Workhorse function that does actual conversion.
     *
     *  First performs custom tag replacement specified by $search and
     *  $replace arrays. Then strips any remaining HTML tags, reduces whitespace
     *  and newlines to a readable format, and word wraps the text to
     *  $width characters.
     *
     *  @access private
     *  @return void
     */
    function _convert()
    {
        // Every conversion starts with its own link table; otherwise the
        // links of a previously converted document would leak into this one.
        $this->_link_list = array();

        // NOTE(review): stripslashes() dates from the magic_quotes era and
        // also removes legitimate backslashes; kept for backward compatibility.
        $text = trim(stripslashes((string) $this->html));

        // Run our defined search-and-replace, pairing patterns and
        // replacements by position exactly like preg_replace() does
        $patterns     = array_values($this->search);
        $replacements = array_values($this->replace);

        foreach ( $patterns as $i => $pattern ) {
            $replacement = isset($replacements[$i]) ? $replacements[$i] : '';

            if ( is_array($replacement) ) {
                // Computed replacement: delegate to the named method
                $callback = array($this, (string) reset($replacement));
                if ( !is_callable($callback) ) {
                    continue;
                }
                $result = preg_replace_callback($pattern, $callback, $text);
            } else {
                $result = preg_replace($pattern, $replacement, $text);
            }

            // preg_* return null on a PCRE error; keep the text we have
            // rather than throwing the whole document away
            if ( $result !== null ) {
                $text = $result;
            }
        }

        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);

        // Bring down number of consecutive newlines to 2 max
        // (i.e. at most one empty line between blocks of text)
        $squeezed = preg_replace("/\n\s+\n/", "\n\n", $text);
        if ( $squeezed !== null ) {
            $text = $squeezed;
        }
        $squeezed = preg_replace("/[\n]{3,}/", "\n\n", $text);
        if ( $squeezed !== null ) {
            $text = $squeezed;
        }

        // Add link list
        if ( sizeof($this->_link_list) ) {
            $text .= "\n\nLinks:\n------\n";
            foreach ( $this->_link_list as $id => $link ) {
                $text .= '[' . ($id + 1) . '] ' . $link . "\n";
            }
        }

        // Wrap the text to a readable format ($width columns, 70 unless
        // changed); words longer than that, such as URLs, are not split
        $text = wordwrap($text, $this->width);

        $this->text = $text;

        $this->_converted = true;
    }

    /**
     *  Replacement callback for <a href="">: registers the link target
     *  and returns the link text followed by its reference number.
     *
     *  @param array $matches Match data; [2] is the href, [3] the link text
     *  @access private
     *  @return string
     *  @see _build_link_list()
     */
    function _replace_link( $matches )
    {
        return $this->_build_link_list($matches[2], $matches[3]);
    }

    /**
     *  Replacement callback for H1 - H3: upper-cased text set off by
     *  an empty line above and below.
     *
     *  @param array $matches Match data; [1] is the heading content
     *  @access private
     *  @return string
     */
    function _replace_heading_major( $matches )
    {
        return "\n\n" . strtoupper($matches[1]) . "\n\n";
    }

    /**
     *  Replacement callback for H4 - H6: text with the first letter of
     *  every word capitalized, preceded by an empty line.
     *
     *  @param array $matches Match data; [1] is the heading content
     *  @access private
     *  @return string
     */
    function _replace_heading_minor( $matches )
    {
        return "\n\n" . ucwords($matches[1]) . "\n";
    }

    /**
     *  Replacement callback for <b>: upper-cased text.
     *
     *  @param array $matches Match data; [1] is the bold content
     *  @access private
     *  @return string
     */
    function _replace_bold( $matches )
    {
        return strtoupper($matches[1]);
    }

    /**
     *  Replacement callback for <th>: like a <td> cell (two tabs in
     *  front, newline behind) but upper-cased.
     *
     *  @param array $matches Match data; [1] is the cell content
     *  @access private
     *  @return string
     */
    function _replace_table_header( $matches )
    {
        return "\t\t" . strtoupper($matches[1]) . "\n";
    }

    /**
     *  Helper function called on link replacement.
     *
     *  Maintains an internal list of links to be displayed at the end of the
     *  text, with numeric indices to the original point in the text they
     *  appeared. A link that carries its own URI scheme ("http:", "https:",
     *  "mailto:", "ftp:", ...) is listed as is; anything else is treated as
     *  relative and prefixed with the base URL. A target that occurs more
     *  than once is listed once and always gets the same number.
     *
     *  @param string $link URL of the link
     *  @param string $display Part of the text to associate number with
     *  @access private
     *  @return string
     */
    function _build_link_list( $link, $display )
    {
        if ( !$this->_do_links ) {
            return $display;
        }

        if ( preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link) ) {
            // Absolute: the link names its own scheme
            $url = $link;
        } else {
            // Relative: hang it onto the base URL, which never ends in "/"
            $url = $this->url;
            if ( substr($link, 0, 1) != '/' ) {
                $url .= '/';
            }
            $url .= $link;
        }

        $index = array_search($url, $this->_link_list, true);
        if ( $index === false ) {
            $index = sizeof($this->_link_list);
            $this->_link_list[$index] = $url;
        }

        return $display . ' [' . ($index + 1) . ']';
    }
}
450
451?>
Note: See TracBrowser for help on using the repository browser.