#!/usr/bin/env python3
"""Print the link targets found in an HTML document.

Usage::

    geturls.py FILE_OR_URL

The argument is either the path of an HTML file on disk or a URL starting
with ``http://`` or ``https://``.  The ``href`` value of every ``<a>`` tag
is written to standard output, one per line, in document order.
"""
# Provenance: this script was introduced by the patch
#   From b9428d9af77a1ea050115c9ac9a1bd7acb0421a4
#   From: TKE
#   Date: Wed, 23 Dec 2020 11:48:59 +0100
#   Subject: [PATCH] Add geurls.py
# which created geturls.py (new file, mode 100755).

import sys
from typing import List, Optional, Union

# Prefixes that make the command-line argument a URL rather than a path.
URL_PREFIXES = ("http://", "https://")

# Local files are decoded with this fixed encoding.
LOCAL_FILE_ENCODING = "ISO-8859-1"

# Seconds to wait on the network before giving up; without a timeout
# requests.get() may block forever on an unresponsive server.
REQUEST_TIMEOUT = 30


def is_remote(source: str) -> bool:
    """Return True when *source* starts with ``http://`` or ``https://``."""
    return source.startswith(URL_PREFIXES)


def load_html(source: str) -> Union[str, bytes]:
    """Return the HTML markup named by *source*.

    Args:
        source: A URL (see :func:`is_remote`) or a path to a local file.

    Returns:
        The raw response body as ``bytes`` for a URL, or the file content
        decoded as ISO-8859-1 ``str`` for a local path.

    Raises:
        OSError: If the local file cannot be opened or read.
        requests.RequestException: If the download fails or times out.
    """
    if is_remote(source):
        # Imported only when needed so local files work without requests.
        import requests

        response = requests.get(source, timeout=REQUEST_TIMEOUT)
        return response.content

    with open(source, "rt", encoding=LOCAL_FILE_ENCODING) as handle:
        return handle.read()


def extract_links(markup: Union[str, bytes]) -> List[str]:
    """Return the non-empty ``href`` values of all ``<a>`` tags in *markup*.

    The parsed tree is queried directly instead of searching the
    re-serialised markup for the text ``a href``.  That way anchors whose
    ``href`` is not the first attribute are found, and an empty ``href``
    is skipped rather than ending the scan.

    Args:
        markup: HTML as text or raw bytes.

    Returns:
        Link targets in document order; duplicates are kept.
    """
    # Imported here so the module can be imported without the parser
    # installed (for example to reuse load_html()).
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(markup, features="lxml")
    links = []
    for anchor in soup.find_all("a", href=True):
        target = anchor.get("href")
        if target:
            links.append(target)
    return links


def main(argv: Optional[List[str]] = None) -> int:
    """Run the command-line interface.

    Args:
        argv: Arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 2 when the argument count is
        wrong.
    """
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 1:
        print("usage: geturls.py FILE_OR_URL", file=sys.stderr)
        return 2

    for link in extract_links(load_html(args[0])):
        print(link)
    return 0


if __name__ == "__main__":
    sys.exit(main())