trafilatura
Adapted from the Griptape AI Framework documentation.
__all__ = ['TrafilaturaWebScraperDriver']
module-attribute
Bases:
BaseWebScraperDriver
Source Code in griptape/drivers/web_scraper/trafilatura_web_scraper_driver.py
@define
class TrafilaturaWebScraperDriver(BaseWebScraperDriver):
    """Web scraper driver that downloads and extracts pages with trafilatura.

    Attributes:
        include_links: Forwarded to ``trafilatura.extract`` as ``include_links``.
        no_ssl: Forwarded to ``trafilatura.fetch_url`` as ``no_ssl``.
    """

    include_links: bool = field(default=True, kw_only=True)
    no_ssl: bool = field(default=False, kw_only=True)

    def fetch_url(self, url: str) -> str:
        """Download the page at ``url`` and return what trafilatura fetched.

        Args:
            url: Address of the page to download.

        Returns:
            The page as returned by ``trafilatura.fetch_url``.

        Raises:
            Exception: If trafilatura returns ``None`` for the URL.
        """
        trafilatura = import_optional_dependency("trafilatura")

        page = trafilatura.fetch_url(url, no_ssl=self.no_ssl)

        # Disable error logging in trafilatura as it sometimes logs errors from lxml, even though
        # the end result of page parsing is successful.
        logging.getLogger("trafilatura").setLevel(logging.FATAL)

        if page is None:
            raise Exception("can't access URL")

        return page

    def extract_page(self, page: str) -> TextArtifact:
        """Extract the main text of a downloaded page.

        Args:
            page: Page content, as returned by ``fetch_url``.

        Returns:
            A ``TextArtifact`` built from the ``"text"`` field of trafilatura's
            JSON output.

        Raises:
            Exception: If trafilatura returns no extraction result.
        """
        trafilatura = import_optional_dependency("trafilatura")

        config = trafilatura.settings.use_config()
        # This disables signal, so that trafilatura can work on any thread:
        # More info: https://trafilatura.readthedocs.io/usage-python.html#disabling-signal
        # The setting has to live on the config handed to extract() below; setting it
        # on a throwaway config elsewhere has no effect on extraction.
        config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")

        extracted_page = trafilatura.extract(
            page,
            include_links=self.include_links,
            output_format="json",
            config=config,
        )

        if not extracted_page:
            raise Exception("can't extract page")

        text = json.loads(extracted_page).get("text")

        return TextArtifact(text)
include_links = field(default=True, kw_only=True)
class-attribute instance-attribute
no_ssl = field(default=False, kw_only=True)
class-attribute instance-attribute
extract_page(page)
Source Code in griptape/drivers/web_scraper/trafilatura_web_scraper_driver.py
def extract_page(self, page: str) -> TextArtifact:
    """Extract the main text of a downloaded page.

    Args:
        page: Page content to run through ``trafilatura.extract``.

    Returns:
        A ``TextArtifact`` built from the ``"text"`` field of trafilatura's
        JSON output.

    Raises:
        Exception: If trafilatura returns no extraction result.
    """
    trafilatura = import_optional_dependency("trafilatura")

    config = trafilatura.settings.use_config()
    # This disables signal, so that trafilatura can work on any thread:
    # More info: https://trafilatura.readthedocs.io/usage-python.html#disabling-signal
    # It must be set on the very config object that is passed to extract() below.
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")

    extracted_page = trafilatura.extract(
        page,
        include_links=self.include_links,
        output_format="json",
        config=config,
    )

    if not extracted_page:
        raise Exception("can't extract page")

    text = json.loads(extracted_page).get("text")

    return TextArtifact(text)
fetch_url(url)
Source Code in griptape/drivers/web_scraper/trafilatura_web_scraper_driver.py
def fetch_url(self, url: str) -> str:
    """Download the page at ``url`` and return what trafilatura fetched.

    Args:
        url: Address of the page to download.

    Returns:
        The page as returned by ``trafilatura.fetch_url``.

    Raises:
        Exception: If trafilatura returns ``None`` for the URL.
    """
    trafilatura = import_optional_dependency("trafilatura")

    # NOTE: no trafilatura config is built here. The previous version created one
    # and set EXTRACTION_TIMEOUT on it, but the object was never passed to any
    # trafilatura call, so it had no effect on the download.
    page = trafilatura.fetch_url(url, no_ssl=self.no_ssl)

    # Disable error logging in trafilatura as it sometimes logs errors from lxml, even though
    # the end result of page parsing is successful.
    logging.getLogger("trafilatura").setLevel(logging.FATAL)

    if page is None:
        raise Exception("can't access URL")

    return page
- On this page
- extract_page(page)
- fetch_url(url)
Could this page be better? Report a problem or suggest an addition!