Skip to content Skip to sidebar Skip to footer

Scrape Multiple URLs Using QWebPage

I'm using Qt's QWebPage to render a page that uses JavaScript to update its content dynamically, so a library that just downloads a static version of the page (such as urllib2) won't work.

Solution 1:

The problem with your program is that you are attempting to create a new QApplication with every url you fetch.

Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.

Here are some examples which show how to use the WebPage class:

Usage:

def my_html_processor(html: str, url: str) -> None:
    """Report how many characters of HTML were loaded from *url*.

    Intended as a slot for ``WebPage.htmlReady``, which emits the page's
    html text followed by its url.
    """
    print(f'loaded: [{len(html)} chars] {url}')

import sys

# Exactly one QApplication and one WebPage are created; the same page
# object is reused for every url that gets fetched.
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)

# example 1: process list of urls

urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)

# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))

sys.exit(app.exec_())

PyQt5 WebPage:

from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage

class WebPage(QWebEnginePage):
    """Web page that loads a sequence of urls one after another.

    Each time a page finishes loading, its html is requested, emitted
    through ``htmlReady`` together with the page url, and the next url is
    fetched. When the urls run out, the running QApplication is quit.
    """

    # Emitted as (html, url) once the html of a loaded page is available.
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        """Create the page.

        Args:
            verbose: when true, javascript console messages are forwarded
                to the base-class handler; otherwise they are dropped.
        """
        super().__init__()
        self._verbose = verbose
        # Start with an exhausted iterator so that fetchNext() is safe to
        # call before process() has supplied any urls.
        self._urls = iter(())
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        """Start fetching *urls* (any iterable of url strings) in order."""
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        """Begin loading the next url.

        Returns:
            True if a load was started, False if no urls remain.
        """
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QUrl(url))
        return True

    def processCurrentPage(self, html):
        """Emit the html of the current page, then move on to the next url.

        Used as the callback for ``toHtml``; quits the application once
        there is nothing left to fetch.
        """
        self.htmlReady.emit(html, self.url().toString())
        if not self.fetchNext():
            QApplication.instance().quit()

    def handleLoadFinished(self):
        """Request the page html; it is delivered to processCurrentPage."""
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, *args, **kwargs):
        """Forward javascript console messages only in verbose mode."""
        if self._verbose:
            super().javaScriptConsoleMessage(*args, **kwargs)

PyQt4 WebPage:

from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage

class WebPage(QWebPage):
    """Web page that loads a sequence of urls one after another.

    Each time the main frame finishes loading, the frame's html and url
    are emitted through ``htmlReady`` and the next url is fetched. When
    the urls run out, the running QApplication is quit.
    """

    # Emitted as (html, url) for every page that finishes loading.
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        """Create the page.

        verbose -- when true, javascript console messages are forwarded
        to the base-class handler; otherwise they are dropped.
        """
        super(WebPage, self).__init__()
        self._verbose = verbose
        # Start with an exhausted iterator so that fetchNext() is safe to
        # call before process() has supplied any urls.
        self._urls = iter(())
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        """Start fetching *urls* (any iterable of url strings) in order."""
        self._urls = iter(urls)
        self.fetchNext()

    # Backward-compatible alias: this entry point used to be named
    # ``start``, while callers of the class use ``process``.
    start = process

    def fetchNext(self):
        """Begin loading the next url in the main frame.

        Returns True if a load was started, False if no urls remain.
        """
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QUrl(url))
        return True

    def processCurrentPage(self):
        """Emit the html and url of the page currently in the main frame."""
        # Take the url from the frame itself; no local ``url`` exists in
        # this method, so referring to one would raise NameError.
        url = self.mainFrame().url().toString()
        self.htmlReady.emit(self.mainFrame().toHtml(), url)
        print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

    def handleLoadFinished(self):
        """Process the loaded page, then fetch the next url or quit."""
        self.processCurrentPage()
        if not self.fetchNext():
            QApplication.instance().quit()

    def javaScriptConsoleMessage(self, *args, **kwargs):
        """Forward javascript console messages only in verbose mode."""
        if self._verbose:
            super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)

Post a Comment for "Scrape Multiple Urls Using Qwebpage"