author     Luca Tringali <tringalinvent@libero.it>    2018-09-03 23:00:54 +0200
committer  Luca Tringali <tringalinvent@libero.it>    2018-09-03 23:00:54 +0200
commit     6b76d666dd1cfb851639453140533a155080d1df (patch)
tree       3348e7f3f0af43c24f183d6a2ab90b74223baee4 /scrapefb.py
First commit
Diffstat (limited to 'scrapefb.py')
-rw-r--r--    scrapefb.py    194
1 file changed, 194 insertions, 0 deletions
diff --git a/scrapefb.py b/scrapefb.py
new file mode 100644
index 0000000..94c8c72
--- /dev/null
+++ b/scrapefb.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Written by Luca Tringali
+# The code is described here:
+# https://www.codice-sorgente.it/2018/08/facebook-scraping-scaricare-tutti-i-post-delle-pagine-facebook/
+# Released under GNU GPL3
+
+# USAGE: python3 ./scrapefb.py https://facebookpageurl/ ./ CSV
+# The second argument is the output folder, the third one (in caps) is the format: TXT or CSV
+
+import urllib.request
+import urllib.parse
+import re
+import html
+import sys
+import os
+import json
+import datetime
+import time
+from socket import timeout
+
+useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
+
+
+def geturl(thisurl):
+    global useragent
+    if thisurl == '':
+        return ''
+    req = urllib.request.Request(
+        thisurl,
+        data=None,
+        headers={
+            'User-Agent': useragent
+        }
+    )
+
+    thishtml = ""
+    try:
+        f = urllib.request.urlopen(req, timeout=300)
+        ft = f.read()  # we should stop if this is taking too long
+    except Exception:
+        f = None
+        ft = b""  # bytes, so the decode fallbacks below still work
+    try:
+        encoding = f.info().get_content_charset()
+        if encoding is None:
+            encoding = 'windows-1252'
+        thishtml = ft.decode(encoding)
+    except Exception:
+        try:
+            thishtml = ft.decode('utf-8', 'backslashreplace')
+        except Exception:
+            thishtml = str(ft)
+    try:
+        thishtml = html.unescape(thishtml)
+    except Exception:
+        thishtml = ""
+    return thishtml
+
+
+def scrapefacebook(mypage, output="./", ascsv=False):
+    TOSELECT_FB = 'pages_reaction_units'
+    startposts = "{\"__html\":"
+    endposts = "]]"
+    maxresults = 300
+    towait = 10
+    lstart = '/pages_reaction_units/more/?page_id='
+    lending = '&cursor={"card_id":"videos","has_next_page":true}&surface=www_pages_home&unit_count=' + str(maxresults) + '&referrer&dpr=1&__user=0&__a=1'
+    allhtml = geturl(mypage)
+    try:
+        start = mypage.index("https://")
+        end = mypage.index('/', start + 8)
+        fbdomain = mypage[start:end]
+        indexes = [(m.start(0), m.end(0)) for m in re.finditer(TOSELECT_FB, allhtml[start + 1:])]
+        start = indexes[0][0]
+        end = allhtml.index('"', start + 1)
+        thislink = allhtml[start:end]
+        # getting the page ID from the page source HTML, e.g.:
+        # https://it-it.facebook.com/pages_reaction_units/more/?page_id=286796408016028&cursor={"card_id":"videos","has_next_page":true}&surface=www_pages_home&unit_count=300&referrer&dpr=1&__user=0&__a=1
+        start = thislink.index("page_id=")
+        end = thislink.index('&', start + 9)
+        pageid = thislink[start + 8:end]
+        start = mypage.index("facebook.com")
+        pagename = mypage[start + 12:]
+        pagename = re.sub(r'[^A-Za-z0-9]', "", pagename)
+    except Exception:
+        fbdomain = ""
+        pageid = ""
+        pagename = ""  # must be defined here too, or building fname below fails
+    fname = output + "fb_" + pagename + ".txt"
+    if ascsv:
+        fname = output + "fb_" + pagename + ".csv"
+    alllinks = []
+    linksfile = output + "fb_" + pagename + ".tmp"
+    if os.path.isfile(linksfile):
+        with open(linksfile, encoding='utf-8') as lfile:
+            alllinks = [line.rstrip('\n') for line in lfile]
+    timelineiter = 0
+    ripristino = False
+    active = True
+    while active:
+        link = fbdomain + lstart + pageid + lending
+        if timelineiter == 0 and len(alllinks) > 0:
+            # resume from the last cursor saved in the .tmp file
+            link = alllinks[-1]
+            ripristino = True
+        with open(linksfile, "a", encoding='utf-8') as lfile:
+            lfile.write(link + '\n')
+        print(link)
+        newhtml = geturl(link)
+        try:
+            start = newhtml.index(startposts)
+            end = newhtml.index(endposts)
+            postshtml = newhtml[start:end]
+            # deleting unicode surrogates
+            postshtml = postshtml.encode("utf-8").decode('unicode-escape')
+            postshtml = re.sub(r'[\uD800-\uDFFF]', "", postshtml)
+            # splitting posts by their Unix time (data-utime) and keeping only text inside <p> </p> tags
+            postsarray = re.split('data-utime', postshtml)
+            timearray = []
+            for i in range(len(postsarray)):
+                try:
+                    start = postsarray[i].index('"')
+                    end = postsarray[i].index('"', start + 2)
+                    utime = postsarray[i][start:end]
+                    utime = re.sub(r'[^0-9]', "", utime)
+                    utimei = int(utime)
+                except Exception:
+                    utimei = 0
+                thistime = datetime.datetime.utcfromtimestamp(utimei).strftime('%Y-%m-%d %H:%M:%S')
+                timearray.append(thistime)
+                indexes = [(m.start(0), m.end(0)) for m in re.finditer(r'<p>(.*?)<\\/p>', postsarray[i])]
+                thispost = ""
+                for n in range(len(indexes)):
+                    start = indexes[n][0]
+                    end = indexes[n][1]
+                    thispost = thispost + postsarray[i][start:end]
+                # cleaning unnecessary tags
+                postsarray[i] = re.sub(r'<.*?>', "", thispost)
+                # cleaning unnecessary slashes
+                postsarray[i] = re.sub(r'\\/', "/", postsarray[i])
+            print(postsarray)
+            try:
+                # after the first page, results are fetched in smaller chunks
+                maxresults = 8
+                start = newhtml.index('&cursor=')
+                end = newhtml.index("&unit_count=", start + 1)
+                lending = newhtml[start:end]
+                # again, cleaning unicode surrogates
+                lending = lending.encode("utf-8").decode('unicode-escape')
+                lending = re.sub(r'[\uD800-\uDFFF]', "", lending)
+                lending = urllib.parse.unquote(lending)
+                lending = lending + '&unit_count=' + str(maxresults) + '&dpr=1&__user=0&__a=1'
+                # e.g.: https://it-it.facebook.com/pages_reaction_units/more/?page_id=286796408016028&cursor={"timeline_cursor":"timeline_unit:1:00000000001528624041:04611686018427387904:09223372036854775793:04611686018427387904","timeline_section_cursor":{},"has_next_page":true}&surface=www_pages_home&unit_count=8&dpr=1&__user=0&__a=1
+            except Exception:
+                # no cursor for the next page: we reached the end of the timeline
+                active = False
+        except Exception:
+            postsarray = []
+            timearray = []
+        # saving results in a file, creating a new one or appending to an existing one if needed
+        if fname != "":
+            postsfile = ""
+            for i in range(len(postsarray)):
+                if postsarray[i] != "":
+                    if ascsv:
+                        postsfile = postsfile + timearray[i] + "\t"
+                    postsfile = postsfile + postsarray[i] + "\n"
+            if timelineiter == 0 and ripristino == False:
+                with open(fname, "w", encoding='utf-8') as text_file:
+                    text_file.write(postsfile)
+            else:
+                with open(fname, "a", encoding='utf-8') as myfile:
+                    myfile.write(postsfile)
+        timelineiter = timelineiter + 1
+        time.sleep(towait)
+
+
+if __name__ == '__main__':
+    START_PAGE = "https://it-it.facebook.com/chiesapastafarianaitaliana/"
+    if len(sys.argv) > 1:
+        START_PAGE = sys.argv[1]
+    if "facebook.com" in START_PAGE:
+        output = "./"
+        if len(sys.argv) > 2:
+            output = sys.argv[2]
+        ascsv = False
+        if len(sys.argv) > 3:
+            if sys.argv[3] == "CSV":
+                ascsv = True
+        scrapefacebook(START_PAGE, output, ascsv)
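
For reference, a minimal sketch of driving the scraper from another Python script instead of the command line. It assumes scrapefb.py sits in the working directory; the page URL is the same default used in the __main__ block above, and the call blocks until the whole timeline has been paged through:

# A minimal sketch, assuming scrapefb.py is importable from the working directory.
from scrapefb import scrapefacebook

# Same default page as the __main__ block. With ascsv=True each output line is
# "<UTC timestamp>\t<post text>", written to ./fb_<pagename>.csv (here,
# ./fb_chiesapastafarianaitaliana.csv); a .tmp file of cursors allows resuming.
scrapefacebook("https://it-it.facebook.com/chiesapastafarianaitaliana/",
               output="./", ascsv=True)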