如何使用PHP递归地获取网站中所有页面中的所有链接


How to get all links in all pages in web site recursively using PHP?

这是我的代码示例。它只能扫描一个网页并打印该网页上的所有链接。

我需要递归地解析并扫描整个网站,打印出该网站所有页面上的所有链接。

这是我的类的一个示例:

<?php
/**
 * Recursively crawls a site starting at a root URL and prints every link
 * whose href starts with that root URL.
 *
 * Each distinct link is printed once (followed by "<br>") and is then fetched
 * and scanned in turn, depth-first, until the configured number of links has
 * been printed. The root page itself is treated as already visited, so links
 * pointing back to it are not printed.
 */
class ParseLinks
{
    /** @var string Start URL; also the prefix a link must have to be printed and followed. */
    private $sRootLink;
    /** @var int Maximum number of links printed during one crawl. */
    private $iCountOfPages;
    /** @var int Number of links printed so far in the current crawl. */
    private $iCounter = 0;
    /** @var array Set of already visited URLs (URL => true); prevents endless loops. */
    private $cache = array();

    /**
     * @param string $sRootLink     URL of the page where the crawl starts.
     * @param int    $iCountOfPages Maximum number of links to print.
     */
    public function __construct($sRootLink, $iCountOfPages)
    {
        $this->sRootLink = $sRootLink;
        $this->iCountOfPages = $iCountOfPages;
    }

    /** @return string The configured start URL. */
    public function getRootLink()
    {
        return $this->sRootLink;
    }

    /** @return int The configured maximum number of printed links. */
    public function getCountOfPages()
    {
        return $this->iCountOfPages;
    }

    /** @param string $sRootLink New start URL. */
    public function setRootLink($sRootLink)
    {
        $this->sRootLink = $sRootLink;
    }

    /** @param int $iCountOfPages New maximum number of printed links. */
    public function setCountOfPages($iCountOfPages)
    {
        $this->iCountOfPages = $iCountOfPages;
    }

    /**
     * Runs a crawl from the root link and echoes every discovered link.
     *
     * The counter and the visited set are reset first so the same object can
     * be used for more than one crawl.
     */
    public function getAllLinks()
    {
        $this->iCounter = 0;
        $this->cache = array();
        $this->rec($this->sRootLink);
    }

    /**
     * Fetches one page, prints its not-yet-seen links under the root URL and
     * recurses into each of them.
     *
     * @param string $link URL (or path) of the page to scan.
     */
    private function rec($link)
    {
        // A page is scanned at most once, otherwise mutually linked pages
        // would recurse forever.
        if (isset($this->cache[$link])) {
            return;
        }
        $this->cache[$link] = true;

        // Unreachable pages are skipped; the warning is suppressed because the
        // failure is handled through the return value.
        $html = @file_get_contents($link);
        if ($html === false || trim($html) === '') {
            // DOMDocument::loadHTML() rejects an empty document.
            return;
        }

        // Real-world markup is rarely valid; collect libxml errors internally
        // instead of emitting a warning for each one.
        $previousSetting = libxml_use_internal_errors(true);
        $DOM = new DOMDocument;
        $DOM->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        foreach ($DOM->getElementsByTagName('a') as $element) {
            if ($this->iCounter >= $this->iCountOfPages) {
                break;
            }
            $href = $element->getAttribute("href");
            if (!$this->startsWith($href, $this->sRootLink)) {
                continue;
            }
            if (isset($this->cache[$href])) {
                // Already printed and scanned earlier in this crawl.
                continue;
            }
            // The href comes from a remote page, so escape it before echoing
            // it into HTML output.
            echo htmlspecialchars($href, ENT_QUOTES, 'UTF-8') . "<br>";
            $this->iCounter++;
            $this->rec($href);
        }
    }

    /**
     * Tells whether $haystack begins with $needle.
     *
     * @param string $haystack String to inspect.
     * @param string $needle   Expected prefix; an empty prefix always matches.
     * @return bool
     */
    private function startsWith($haystack, $needle)
    {
        return strncmp($haystack, $needle, strlen($needle)) === 0;
    }
}

如果有人需要,这是我的版本,运行正常。下面是我的类的示例:在入口处传入站点地址和要显示的链接数量。

<?php
// Runtime configuration for the crawl script.
// Errors are displayed, including those raised during PHP startup.
ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
// Only fatal errors are reported. The script previously also set the level to
// E_ALL just before this call, which had no effect because this call
// overrides it; the effective level (E_ERROR) is kept.
error_reporting(E_ERROR);
// Crawling many pages is slow, so raise the execution time limit (seconds).
set_time_limit(15000);
/**
 * Crawls a web site recursively, collects every distinct link that points to
 * the same host as the start URL, prints the result as HTML and stores it in
 * the file "allLinksFromYourSite.csv" in the current working directory.
 */
class ParseLinks
{
    /** @var string URL where the crawl starts. */
    private $sRootLink;
    /** @var int Maximum number of links to collect. */
    private $iCountOfPages;
    /** @var int Maximum page depth that is parsed (the start page is depth 1). */
    private $iMaxDeep;
    /** @var array Collected links in discovery order. */
    private $linkArray = array();
    /** @var array Set of collected links (link => true) for constant-time duplicate checks. */
    private $seenLinks = array();
    /** @var array Set of pages that were already fetched (URL => true). */
    private $crawledPages = array();
    /** @var int Depth of the page currently being parsed; 0 means no crawl is running. */
    private $iDeep = 0;
    /** @var string Host of the start URL; only links to this host are collected. */
    private $sDomain = "";
    /** @var string Scheme of the start URL, used to complete relative links. */
    private $sScheme = "";

    /**
     * @param string $sRootLink     Absolute URL of the start page (must contain scheme and host).
     * @param int    $iCountOfPages Maximum number of links to collect.
     * @param int    $iMaxDeep      Maximum page depth to parse; defaults to 4.
     */
    public function __construct($sRootLink, $iCountOfPages, $iMaxDeep = 4)
    {
        $this->sRootLink = $sRootLink;
        $this->iCountOfPages = $iCountOfPages;
        $this->iMaxDeep = $iMaxDeep;
    }

    /**
     * Runs the crawl, echoes the collected links and writes them to the CSV file.
     *
     * All crawl state is reset first so the object can be reused.
     *
     * @throws RuntimeException When the output file cannot be opened for writing.
     */
    public function getAllLinks()
    {
        $this->linkArray = array();
        $this->seenLinks = array();
        $this->crawledPages = array();
        $this->iDeep = 0;
        $this->sDomain = "";
        $this->sScheme = "";

        $this->recParseLinks($this->sRootLink);
        $this->printLinks();
        $this->saveToCSV();
    }

    /**
     * Echoes a short header followed by one HTML anchor per collected link.
     */
    private function printLinks()
    {
        echo "Web-site: www." . $this->sDomain . "</br>Count of links: " . count($this->linkArray) . "</br></br>";
        foreach ($this->linkArray as $element) {
            // Links come from remote pages; escape them for both the
            // single-quoted attribute and the anchor text.
            $escaped = htmlspecialchars($element, ENT_QUOTES, 'UTF-8');
            echo "<a href='" . $escaped . "'>" . $escaped . "</a>" . "<br>";
        }
    }

    /**
     * Writes the header and one link per line to "allLinksFromYourSite.csv".
     *
     * @throws RuntimeException When the file cannot be opened for writing.
     */
    private function saveToCSV()
    {
        $fp = fopen("allLinksFromYourSite.csv", "w");
        if ($fp === false) {
            throw new RuntimeException("Cannot open allLinksFromYourSite.csv for writing");
        }
        fwrite($fp, "Web-site: $this->sDomain" . PHP_EOL);
        fwrite($fp, "Count of links: " . count($this->linkArray) . PHP_EOL . PHP_EOL);
        foreach ($this->linkArray as $element) {
            fwrite($fp, $element . PHP_EOL);
        }
        fclose($fp);
    }

    /**
     * Parses one page, records its new same-host links and descends into them
     * while the depth limit allows.
     *
     * @param string $link URL of the page to parse.
     */
    private function recParseLinks($link)
    {
        if (strlen($link) <= 1) {
            return;
        }
        // Never fetch the same page twice (e.g. a link back to the start page).
        if (isset($this->crawledPages[$link])) {
            return;
        }
        if ($this->iDeep == 0) {
            // First call: remember host and scheme of the start URL.
            $d = parse_url($link);
            if ($d === false || !isset($d['host'], $d['scheme'])) {
                return;
            }
            $this->sDomain = $d['host'];
            $this->sScheme = $d['scheme'];
        }
        $this->crawledPages[$link] = true;

        // The failure is handled through the return value, so the warning of
        // file_get_contents() is suppressed.
        $html = @file_get_contents($link);
        if ($html === false || trim($html) === '') {
            // Unreachable or empty page; loadHTML() rejects an empty document.
            return;
        }

        // Collect markup errors internally instead of emitting warnings.
        $previousSetting = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $this->iDeep++;
        foreach ($doc->getElementsByTagName('a') as $element) {
            if (count($this->linkArray) >= $this->iCountOfPages) {
                break;
            }
            $links = trim($element->getAttribute('href'));
            if ($links === '') {
                continue;
            }
            if (strncmp($links, '//', 2) === 0) {
                // Protocol-relative link: only the scheme is missing.
                $links = $this->sScheme . ":" . $links;
            } elseif ($links[0] == '/' || $links[0] == '?') {
                $links = $this->sScheme . "://" . $this->sDomain . $links;
            }
            $p_links = parse_url($links);
            // Links without a host (relative paths, mailto:, javascript:, ...)
            // and links to other hosts are ignored. Host names are
            // case-insensitive.
            if ($p_links === false || !isset($p_links['host'])) {
                continue;
            }
            if (strcasecmp($p_links['host'], $this->sDomain) !== 0) {
                continue;
            }
            if ($this->linkExists($links)) {
                continue;
            }
            $this->seenLinks[$links] = true;
            $this->linkArray[] = $links;
            if ($this->iDeep < $this->iMaxDeep) {
                $this->recParseLinks($links);
            }
        }
        $this->iDeep--;
    }

    /**
     * Tells whether a link has already been collected.
     *
     * @param string $link Absolute link to look up.
     * @return bool
     */
    private function linkExists($link)
    {
        return isset($this->seenLinks[$link]);
    }
}
// Entry point: crawl the given site and collect at most 3000 links.
// Replace the URL with the address of the site to scan.
$startUrl = 'https://yoursite.com/';
$maxLinks = 3000;
$crawler = new ParseLinks($startUrl, $maxLinks);
$crawler->getAllLinks();