// SPDX-FileCopyrightText: 2004-2023 Ryan Parman, Sam Sneddon, Ryan McCue
// SPDX-License-Identifier: BSD-3-Clause
use InvalidArgumentException;
use Psr\Http\Client\ClientInterface;
use Psr\Http\Message\RequestFactoryInterface;
use Psr\Http\Message\UriFactoryInterface;
use SimplePie\Cache\Base;
use SimplePie\Cache\BaseDataCache;
use SimplePie\Cache\CallableNameFilter;
use SimplePie\Cache\DataCache;
use SimplePie\Cache\NameFilter;
use SimplePie\HTTP\Client;
use SimplePie\HTTP\ClientException;
use SimplePie\HTTP\FileClient;
use SimplePie\HTTP\Psr18Client;
/**
 * Used for data cleanup and post-processing
 *
 * This class can be overloaded with {@see \SimplePie\SimplePie::set_sanitize_class()}
 *
 * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
 */
class Sanitize implements RegistryAware
/** @var bool Flag stored by remove_div(); it is consumed outside the part of the file shown here. */
public $remove_div = true;
/** @var string Image handler value; sanitize() only processes <img> tags when this is non-empty and caching is enabled. */
public $image_handler = '';
/** @var string[] Tag names handed one by one to strip_tag() during sanitize(). */
public $strip_htmltags = ['base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'];
/** @var bool Flag stored by encode_instead_of_strip(). */
public $encode_instead_of_strip = false;
/** @var string[] Attribute names handed one by one to strip_attr() during sanitize(). */
public $strip_attributes = ['bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'];
/** @var string[] Attribute names handed one by one to rename_attr() during sanitize(). */
public $rename_attributes = [];
// Tag name => attribute/value pairs handed to add_attr() during sanitize().
/** @var array<string, array<string, string>> */
public $add_attributes = ['audio' => ['preload' => 'none'], 'iframe' => ['sandbox' => 'allow-scripts allow-same-origin'], 'video' => ['preload' => 'none']];
/** @var bool When true, sanitize() removes every comment node from the parsed document. */
public $strip_comments = false;
/** @var string Encoding name stored by set_output_encoding(). */
public $output_encoding = 'UTF-8';
/** @var bool Caching switch stored by pass_cache_data(); also gates the image handling step in sanitize(). */
public $enable_cache = true;
/** @var string Cache location stored by pass_cache_data(). */
public $cache_location = './cache';
// Legacy string callable kept for backwards compatibility; pass_cache_data() only
// overwrites it when it is given a string.
/** @var string&(callable(string): string) */
public $cache_name_function = 'md5';
// Name filter assigned by pass_cache_data(): either the NameFilter passed in or a
// CallableNameFilter wrapping the legacy string callable.
private $cache_namefilter;
/** @var bool Flag stored by pass_file_data(). */
public $force_fsockopen = false;
// Element name => attribute name(s); sanitize() passes each pair to replace_urls().
/** @var array<string, string|string[]> */
public $replace_url_attributes = [];
/**
 * @var array<int, mixed> Custom curl options
 * @see SimplePie::set_curl_options()
 */
private $curl_options = []; // replaced wholesale by pass_file_data()
/**
 * @var int Cache duration (in seconds)
 */
private $cache_duration = 3600; // one hour, given the documented unit of seconds
/**
 * List of domains for which to force HTTPS.
 *
 * @see \SimplePie\Sanitize::set_https_domains()
 * Array is a tree split at DNS levels. Example:
 * array('biz' => true, 'com' => array('example' => true), 'net' => array('example' => array('www' => true)))
 * @var true|array<string, true|array<string, true|array<string, array<string, true|array<string, true|array<string, true>>>>>>
 */
public $https_domains = []; // rebuilt by set_https_domains(), walked by is_https_domain()
/** @var Psr18Client|null Adapter created by set_http_client(); reset to null by pass_file_data(). */
private $http_client = null;
/**
 * Create a sanitizer pre-loaded with the default URL attribute map.
 */
public function __construct()
{
    // Omitting the argument is the same as passing null: both select the defaults.
    $this->set_url_replacements();
}
/**
 * Store the "remove div" flag.
 *
 * @param bool $enable New value of the flag
 * @return void
 */
public function remove_div(bool $enable = true)
{
    // The parameter is already typed as bool, so it can be stored directly.
    $this->remove_div = $enable;
}
/**
 * Set (or clear) the image handler value.
 *
 * sanitize() only runs its image handling step when this value is a
 * non-empty string and caching is enabled.
 *
 * @param string|false $page Handler value; false (or any other empty value) disables image handling
 * @return void
 */
public function set_image_handler($page = false)
{
    // The two assignments must be alternatives: run back to back, the reset
    // always wins and the handler can never be enabled.
    if ($page) {
        $this->image_handler = (string) $page;
    } else {
        $this->image_handler = '';
    }
}
/**
 * Store the registry instance on this object.
 *
 * @param \SimplePie\Registry $registry Registry to keep
 * @return void
 */
public function set_registry(\SimplePie\Registry $registry)
{
    $this->registry = $registry;
}
/**
 * Store the cache settings used by the sanitizer.
 *
 * @param bool $enable_cache Whether caching is enabled
 * @param string $cache_location Cache location
 * @param (string&(callable(string): string))|NameFilter $cache_name_function Name filter, or (for backwards compatibility) a callable given as a string
 * @param class-string<Cache> $cache_class
 * @param DataCache|null $cache
 * @return void
 * @throws InvalidArgumentException if $cache_name_function is neither a string nor a NameFilter
 */
public function pass_cache_data(bool $enable_cache = true, string $cache_location = './cache', $cache_name_function = 'md5', string $cache_class = Cache::class, ?DataCache $cache = null)
{
    $this->enable_cache = $enable_cache;
    $this->cache_location = $cache_location;

    // @phpstan-ignore-next-line Enforce PHPDoc type.
    if (!is_string($cache_name_function) && !$cache_name_function instanceof NameFilter) {
        // The format string has two placeholders, so both arguments are required;
        // calling sprintf() without them raises an ArgumentCountError instead of
        // the intended exception.
        throw new InvalidArgumentException(sprintf(
            '%s(): Argument #3 ($cache_name_function) must be of type %s',
            __METHOD__,
            NameFilter::class
        ));
    }

    // BC: $cache_name_function could be a callable as string
    if (is_string($cache_name_function)) {
        // trigger_error(sprintf('Providing $cache_name_function as string in "%s()" is deprecated since SimplePie 1.8.0, provide as "%s" instead.', __METHOD__, NameFilter::class), \E_USER_DEPRECATED);
        $this->cache_name_function = $cache_name_function;

        // Wrap the legacy callable so that only NameFilter objects are stored below.
        $cache_name_function = new CallableNameFilter($cache_name_function);
    }

    $this->cache_namefilter = $cache_name_function;

    // NOTE(review): $cache_class and $cache are not consumed by any statement
    // visible for this method; confirm whether handling for them belongs here.
}
/**
 * Set a PSR-18 client and PSR-17 factories
 *
 * Allows you to use your own HTTP client implementations.
 *
 * @param ClientInterface $http_client PSR-18 client
 * @param RequestFactoryInterface $request_factory PSR-17 request factory
 * @param UriFactoryInterface $uri_factory PSR-17 URI factory
 * @return void
 */
final public function set_http_client(
    ClientInterface $http_client,
    RequestFactoryInterface $request_factory,
    UriFactoryInterface $uri_factory
) {
    // Bundle the three PSR objects into the adapter that is kept on the instance.
    $adapter = new Psr18Client($http_client, $request_factory, $uri_factory);

    $this->http_client = $adapter;
}
/**
 * Store the legacy file-fetching options and drop any registered HTTP client.
 *
 * @deprecated since SimplePie 1.9.0, use \SimplePie\Sanitize::set_http_client() instead.
 *
 * @param class-string<File> $file_class Not read by this method
 * @param int $timeout Value stored as the timeout
 * @param string $useragent Value stored as the user agent
 * @param bool $force_fsockopen Value stored as the force_fsockopen flag
 * @param array<int, mixed> $curl_options Custom curl options
 * @return void
 */
public function pass_file_data(string $file_class = File::class, int $timeout = 10, string $useragent = '', bool $force_fsockopen = false, array $curl_options = [])
{
    // trigger_error(sprintf('SimplePie\Sanitize::pass_file_data() is deprecated since SimplePie 1.9.0, please use "SimplePie\Sanitize::set_http_client()" instead.'), \E_USER_DEPRECATED);

    // Invalidate the registered client.
    $this->http_client = null;

    $this->curl_options = $curl_options;
    $this->force_fsockopen = $force_fsockopen;
    $this->useragent = $useragent;
    $this->timeout = $timeout;
}
/**
 * Set the list of tags that sanitize() strips.
 *
 * @param string[]|string|false $tags Array of tag names, or a comma-separated string of tag names; false or an empty string strips nothing
 * @return void
 */
public function strip_htmltags($tags = ['base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'])
{
    // The three assignments are alternatives; executed in sequence the last one
    // would always leave the list empty.
    if (is_array($tags)) {
        $this->strip_htmltags = $tags;
    } elseif (is_string($tags) && $tags !== '') {
        $this->strip_htmltags = explode(',', $tags);
    } else {
        $this->strip_htmltags = [];
    }
}
/**
 * Store the "encode instead of strip" flag.
 *
 * @param bool $encode New value of the flag
 * @return void
 */
public function encode_instead_of_strip(bool $encode = false)
{
    $this->encode_instead_of_strip = $encode;
}
/**
 * Set the list of attributes that sanitize() passes to rename_attr().
 *
 * @param string[]|string $attribs Array of attribute names, or a comma-separated string of attribute names; an empty value clears the list
 * @return void
 */
public function rename_attributes($attribs = [])
{
    // Only one of the three assignments may run; without the branches the
    // final reset would always discard the caller's list.
    if (is_array($attribs)) {
        $this->rename_attributes = $attribs;
    } elseif (is_string($attribs) && $attribs !== '') {
        $this->rename_attributes = explode(',', $attribs);
    } else {
        $this->rename_attributes = [];
    }
}
/**
 * Set the list of attributes that sanitize() passes to strip_attr().
 *
 * @param string[]|string $attribs Array of attribute names, or a comma-separated string of attribute names; an empty value clears the list
 * @return void
 */
public function strip_attributes($attribs = ['bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'])
{
    // Only one of the three assignments may run; without the branches the
    // final reset would always discard the caller's list.
    if (is_array($attribs)) {
        $this->strip_attributes = $attribs;
    } elseif (is_string($attribs) && $attribs !== '') {
        $this->strip_attributes = explode(',', $attribs);
    } else {
        $this->strip_attributes = [];
    }
}
/**
 * Replace the map of attributes that sanitize() passes to add_attr().
 *
 * @param array<string, array<string, string>> $attribs Tag name => attribute name/value pairs
 * @return void
 */
public function add_attributes(array $attribs = ['audio' => ['preload' => 'none'], 'iframe' => ['sandbox' => 'allow-scripts allow-same-origin'], 'video' => ['preload' => 'none']])
{
    $this->add_attributes = $attribs;
}
/**
 * Choose whether sanitize() removes comment nodes from parsed HTML.
 *
 * @param bool $strip True to remove comments
 * @return void
 */
public function strip_comments(bool $strip = false)
{
    $this->strip_comments = $strip;
}
/**
 * Store the output encoding name.
 *
 * @param string $encoding Encoding name
 * @return void
 */
public function set_output_encoding(string $encoding = 'UTF-8')
{
    $this->output_encoding = $encoding;
}
/**
 * Set element/attribute key/value pairs of HTML attributes
 * containing URLs that need to be resolved relative to the feed
 *
 * Defaults to |a|@href, |area|@href, |audio|@src, |blockquote|@cite,
 * |del|@cite, |form|@action, |img|@longdesc, |img|@src, |input|@src,
 * |ins|@cite, |q|@cite, |source|@src, |video|@src
 *
 * @param array<string, string|string[]>|null $element_attribute Element/attribute key/value pairs, null for default
 * @return void
 */
public function set_url_replacements(?array $element_attribute = null)
{
    if ($element_attribute === null) {
        // Storing null would break the foreach over this property in sanitize(),
        // so null selects the documented default map instead.
        $element_attribute = [
            'a' => 'href',
            'area' => 'href',
            'audio' => 'src',
            'blockquote' => 'cite',
            'del' => 'cite',
            'form' => 'action',
            'img' => ['longdesc', 'src'],
            'input' => 'src',
            'ins' => 'cite',
            'q' => 'cite',
            'source' => 'src',
            'video' => 'src',
        ];
    }

    $this->replace_url_attributes = $element_attribute;
}
/**
 * Set the list of domains for which to force HTTPS.
 *
 * The list is stored as a tree split at DNS levels, top-level label first;
 * a node equal to true means "this domain and everything below it".
 *
 * @see \SimplePie\Misc::https_url()
 * Example array('biz', 'example.com', 'example.org', 'www.example.net');
 *
 * @param string[] $domains list of domain names ['biz', 'example.com', 'example.org', 'www.example.net']
 * @return void
 */
public function set_https_domains(array $domains)
{
    $this->https_domains = [];
    foreach ($domains as $domain) {
        $domain = trim($domain, ". \t\n\r\0\x0B");
        if ($domain === '') {
            // Nothing left after trimming: an empty label would only add a useless '' key.
            continue;
        }
        $segments = array_reverse(explode('.', $domain));
        /** @var true|array<string, true|array<string, true|array<string, array<string, true|array<string, true|array<string, true>>>>>> */ // Needed for PHPStan.
        $node = &$this->https_domains;
        foreach ($segments as $segment) {//Build a tree
            if ($node === true) {
                // A shorter domain already covers this one.
                break;
            }
            if (!isset($node[$segment])) {
                $node[$segment] = [];
            }
            $node = &$node[$segment];
        }
        // Mark the deepest node reached as a forced-HTTPS leaf.
        $node = true;
    }
    // Drop the reference so later writes to $node cannot touch the tree.
    unset($node);
}
/**
 * Check if the domain is in the list of forced HTTPS.
 *
 * @param string $domain Host name to look up
 * @return bool True when the host, or one of its parent domains, is a leaf of the HTTPS tree
 */
protected function is_https_domain(string $domain)
{
    $domain = trim($domain, '. ');
    $segments = array_reverse(explode('.', $domain));

    // Walk a copy rather than a reference: the lookup is read-only and must
    // never be able to modify the configured tree.
    $node = $this->https_domains;
    foreach ($segments as $segment) {//Explore the tree
        if (!is_array($node) || !isset($node[$segment])) {
            // Either a leaf was reached or the next label is not listed.
            break;
        }
        $node = $node[$segment];
    }

    return $node === true;
}
/**
 * Force HTTPS for selected Web sites.
 *
 * @param string $url URL to inspect
 * @return string The URL with its scheme switched to https when its host is in the forced-HTTPS list, otherwise the URL unchanged
 */
public function https_url(string $url)
{
    // Only plain http:// URLs are candidates.
    if (strtolower(substr($url, 0, 7)) !== 'http://') {
        return $url;
    }

    $host = parse_url($url, PHP_URL_HOST);
    if ($host === false || $host === null) {
        // Malformed URL or missing host
        return $url;
    }

    if (!$this->is_https_domain($host)) {
        return $url;
    }

    // Add the 's' to HTTPS
    return substr_replace($url, 's', 4, 0);
}
* @param int-mask-of<SimplePie::CONSTRUCT_*> $type
* @return string Sanitized data; false if output encoding is changed to something other than UTF-8 and conversion fails
public function sanitize(string $data, int $type, string $base = '')
if ($data !== '' || $type & \SimplePie\SimplePie::CONSTRUCT_IRI) {
if ($type & \SimplePie\SimplePie::CONSTRUCT_MAYBE_HTML) {
if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . \SimplePie\SimplePie::PCRE_HTML_ATTRIBUTE . '>)/', $data)) {
$type |= \SimplePie\SimplePie::CONSTRUCT_HTML;
$type |= \SimplePie\SimplePie::CONSTRUCT_TEXT;
if ($type & \SimplePie\SimplePie::CONSTRUCT_BASE64) {
$data = base64_decode($data);
if ($type & (\SimplePie\SimplePie::CONSTRUCT_HTML | \SimplePie\SimplePie::CONSTRUCT_XHTML)) {
if (!class_exists('DOMDocument')) {
throw new \SimplePie\Exception('DOMDocument not found, unable to use sanitizer');
$document = new \DOMDocument();
$document->encoding = 'UTF-8';
// PHPStan seems to have trouble resolving int-mask because bitwise
// operators are used when operators are used when passing this parameter.
// https://github.com/phpstan/phpstan/issues/9384
/** @var int-mask-of<SimplePie::CONSTRUCT_*> $type */
$data = $this->preprocess($data, $type);
set_error_handler([Misc::class, 'silence_errors']);
$document->loadHTML($data);
$xpath = new \DOMXPath($document);
if ($this->strip_comments) {
/** @var \DOMNodeList<\DOMComment> */
$comments = $xpath->query('//comment()');
foreach ($comments as $comment) {
$parentNode = $comment->parentNode;
assert($parentNode !== null, 'For PHPStan, comment must have a parent');
$parentNode->removeChild($comment);
// Strip out HTML tags and attributes that might cause various security problems.
// Based on recommendations by Mark Pilgrim at:
// https://web.archive.org/web/20110902041826/http://diveintomark.org:80/archives/2003/06/12/how_to_consume_rss_safely
if ($this->strip_htmltags) {
foreach ($this->strip_htmltags as $tag) {
$this->strip_tag($tag, $document, $xpath, $type);
if ($this->rename_attributes) {
foreach ($this->rename_attributes as $attrib) {
$this->rename_attr($attrib, $xpath);
if ($this->strip_attributes) {
foreach ($this->strip_attributes as $attrib) {
$this->strip_attr($attrib, $xpath);
if ($this->add_attributes) {
foreach ($this->add_attributes as $tag => $valuePairs) {
$this->add_attr($tag, $valuePairs, $document);
foreach ($this->replace_url_attributes as $element => $attributes) {
$this->replace_urls($document, $element, $attributes);
// If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
if ($this->image_handler !== '' && $this->enable_cache) {
$images = $document->getElementsByTagName('img');
foreach ($images as $img) {