Tuesday, January 9, 2018

Python Script to Download URL Page Titles

from urllib.request import urlopen
import re
import sys
import os
# Default input file: plain text, one URL (or bare host[:port]) per line.
filepath = 'urls.txt'

# Seconds to wait on each request before giving up on that URL.
FETCH_TIMEOUT_SECONDS = 3

# Matches the first <title> element regardless of tag case, attributes on the
# tag, or line breaks inside the title text.
TITLE_PATTERN = re.compile(
    r'<title\b[^>]*>(.*?)</title>', re.IGNORECASE | re.DOTALL
)


def normalize_url(raw_line):
    """Return a fetchable URL for one line of the input file.

    Surrounding whitespace (including the trailing newline left by file
    iteration) is removed. Lines that do not already start with ``http://``
    or ``https://`` get a scheme prepended: ``https://`` when the text
    contains ``:443``, ``http://`` otherwise.

    Returns an empty string for blank lines so the caller can skip them.
    """
    url = raw_line.strip()
    if not url:
        return ''
    # Test for a real scheme prefix; a plain startswith('http') would treat
    # hosts such as "httpbin.org" as already having one.
    if url.lower().startswith(('http://', 'https://')):
        return url
    scheme = 'https://' if ':443' in url else 'http://'
    return scheme + url


def extract_title(page_text):
    """Return the text of the first ``<title>`` element in *page_text*.

    Runs of whitespace inside the title are collapsed to single spaces so
    the result always fits on one output line. Returns ``None`` when the
    page has no ``<title>`` element.
    """
    match = TITLE_PATTERN.search(page_text)
    if match is None:
        return None
    return ' '.join(match.group(1).split())


def fetch_title(url, timeout=FETCH_TIMEOUT_SECONDS):
    """Download *url* and return its page title, or ``None`` if it has none.

    The body is decoded with the charset announced in the Content-Type
    header, falling back to UTF-8; undecodable bytes are replaced rather
    than raising. Network and HTTP errors propagate to the caller.
    """
    # The context manager closes the connection even if read() fails.
    with urlopen(url, timeout=timeout) as response:
        body = response.read()
        charset = response.headers.get_content_charset() or 'utf-8'
    try:
        text = body.decode(charset, errors='replace')
    except LookupError:
        # The server announced a charset name Python does not know.
        text = body.decode('utf-8', errors='replace')
    return extract_title(text)


def main(path=filepath):
    """Print one ``title,url`` line for every URL listed in *path*.

    URLs that cannot be fetched are reported as ``ERROR,url``. Pages
    without a ``<title>`` element produce no output line. Blank input
    lines are skipped.
    """
    with open(path) as fp:
        for line in fp:
            url = normalize_url(line)
            if not url:
                continue
            try:
                title = fetch_title(url)
            except Exception:
                # Best-effort per URL: any fetch failure is reported and the
                # run continues. Unlike a bare ``except:``, Ctrl-C and
                # SystemExit still stop the script.
                print("ERROR," + url)
                continue
            if title is not None:
                print(title + "," + url)


if __name__ == "__main__":
    # Optional first command-line argument overrides the default input file.
    main(sys.argv[1] if len(sys.argv) > 1 else filepath)

----
Input is a text file with one URL per line.
----
sample results

No comments:

Post a Comment