author     Luca Tringali <tringalinvent@libero.it>    2018-09-03 23:00:54 +0200
committer  Luca Tringali <tringalinvent@libero.it>    2018-09-03 23:00:54 +0200
commit     6b76d666dd1cfb851639453140533a155080d1df (patch)
tree       3348e7f3f0af43c24f183d6a2ab90b74223baee4
First commit
-rw-r--r--    scrapefb.py    194
1 file changed, 194 insertions, 0 deletions
diff --git a/scrapefb.py b/scrapefb.py
new file mode 100644
index 0000000..94c8c72
--- /dev/null
+++ b/scrapefb.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+#Written by Luca Tringali
+# The code is described here:
+# https://www.codice-sorgente.it/2018/08/facebook-scraping-scaricare-tutti-i-post-delle-pagine-facebook/
+#Released under GNU GPL3
+
+#USAGE: python3 ./scrapefb.py https://facebookpageurl/ ./ CSV
+#The first argument is the Facebook page URL, the second is the output folder,
+#and the third (in caps) selects the output format: TXT or CSV
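+#Example: python3 ./scrapefb.py https://it-it.facebook.com/chiesapastafarianaitaliana/ ./ CSV
+#writes the scraped posts to ./fb_chiesapastafarianaitaliana.csv, one post per line with a
+#leading timestamp, plus a ./fb_chiesapastafarianaitaliana.tmp file used to resume interrupted runs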
+
+import urllib.request
+import urllib.parse
+import re
+import html
+import sys
+import os
+import json
+import datetime
+import time
+from socket import timeout
+
+useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
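+# this desktop browser User-Agent is sent with every request made by geturl() below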
+
+def geturl(thisurl):
+    global useragent
+    if thisurl == '':
+        return ''
+    req = urllib.request.Request(
+        thisurl,
+        data=None,
+        headers={
+            'User-Agent': useragent
+        }
+    )
+
+    thishtml = ""
+    try:
+        f = urllib.request.urlopen(req, timeout=300)
+        ft = f.read() #we should stop if this is taking too long
+    except:
+        ft = ""
+    try:
+        encoding = f.info().get_content_charset()
+        if encoding is None:
+            encoding = 'windows-1252'
+        thishtml = ft.decode(encoding)
+    except:
+        try:
+            thishtml = ft.decode('utf-8', 'backslashreplace')
+        except:
+            thishtml = str(ft)
+    try:
+        thishtml = html.unescape(thishtml)
+    except:
+        thishtml = ""
+    return thishtml
+
+
+def scrapefacebook(mypage, output="./", ascsv=False):
+    TOSELECT_FB = 'pages_reaction_units'
+    startposts = "{\"__html\":"
+    endposts = "]]"
+    maxresults = 300
+    towait = 10
+    lstart = '/pages_reaction_units/more/?page_id='
+    lending = '&cursor={"card_id":"videos","has_next_page":true}&surface=www_pages_home&unit_count='+str(maxresults)+'&referrer&dpr=1&__user=0&__a=1'
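+    # posts are fetched from the /pages_reaction_units/more/ AJAX endpoint; the response embeds
+    # the rendered timeline HTML in a JSON field ({"__html": ...), which is what startposts and
+    # endposts delimit below; the first request uses the generic "videos" cursor from the page source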
+    allhtml = geturl(mypage)
+    try:
+        start = mypage.index("https://")
+        end = mypage.index('/',start+8)
+        fbdomain = mypage[start:end]
+        indexes = [(m.start(0), m.end(0)) for m in re.finditer(TOSELECT_FB, allhtml[start+1:])]
+        start = indexes[0][0]
+        end = allhtml.index('"',start+1)
+        thislink = allhtml[start:end]
+        #getting the page ID from the page source HTML
+        #https://it-it.facebook.com/pages_reaction_units/more/?page_id=286796408016028&cursor={"card_id":"videos","has_next_page":true}&surface=www_pages_home&unit_count=300&referrer&dpr=1&__user=0&__a=1
+        start = thislink.index("page_id=")
+        end = thislink.index('&',start+9)
+        pageid = thislink[start+8:end]
+        start = mypage.index("facebook.com")
+        pagename = mypage[start+12:]
+        pagename = re.sub(r'[^A-Za-z0-9]',"",pagename)
+    except:
+        fbdomain = ""
+        pageid = ""
+        pagename = ""
+    fname = output + "fb_" + pagename + ".txt"
+    if ascsv:
+        fname = output + "fb_" + pagename + ".csv"
+    alllinks = []
+    linksfile = output + "fb_" + pagename + ".tmp"
+    if os.path.isfile(linksfile):
+        alllinks = [line.rstrip('\n') for line in open(linksfile, encoding='utf-8')]
+    timelineiter = 0
+    ripristino = False
+    active = True
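+    # fb_<pagename>.tmp records every cursor URL already requested: if it exists, the first
+    # iteration resumes from the last saved URL instead of starting over; each pass of the
+    # loop fetches one batch of posts, appends it to the output file and sleeps towait seconds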
+    while active:
+        link = fbdomain + lstart + pageid + lending
+        if timelineiter == 0 and len(alllinks)>0:
+            link = alllinks[len(alllinks)-1]
+            ripristino = True
+        with open(linksfile, "a", encoding='utf-8') as lfile:
+            lfile.write(link+'\n')
+        print(link)
+        newhtml = geturl(link)
+        try:
+            start = newhtml.index(startposts)
+            end = newhtml.index(endposts)
+            postshtml = newhtml[start:end]
+            #deleting unicode surrogates
+            postshtml = postshtml.encode("utf-8").decode('unicode-escape')
+            postshtml = re.sub(r'[\uD800-\uDFFF]',"",postshtml)
+            #splitting posts by their Unix time (data-utime) and keeping only text inside <p> </p> tags
+            postsarray = re.split('data-utime', postshtml)
+            timearray = []
+            for i in range(len(postsarray)):
+                try:
+                    start = postsarray[i].index('"')
+                    end = postsarray[i].index('"',start+2)
+                    utime = postsarray[i][start:end]
+                    utime = re.sub(r'[^0-9]',"",utime)
+                    utimei = int(utime)
+                except:
+                    utimei = 0
+                thistime = datetime.datetime.utcfromtimestamp(utimei).strftime('%Y-%m-%d %H:%M:%S')
+                timearray.append(thistime)
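+                # e.g. a data-utime value of 1528624041 becomes "2018-06-10 09:47:21" (UTC)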
+                indexes = [(m.start(0), m.end(0)) for m in re.finditer('<p>(.*?)<\\\\/p>', postsarray[i])]
+                thispost = ""
+                for n in range(len(indexes)):
+                    start = indexes[n][0]
+                    end = indexes[n][1]
+                    thispost = thispost + postsarray[i][start:end]
+                #cleaning unnecessary tags
+                postsarray[i] = re.sub(r'<.*?>',"",thispost)
+                #cleaning unnecessary slashes
+                postsarray[i] = re.sub('\\\\/',"/",postsarray[i])
+            print(postsarray)
+            try:
+                maxresults = 8
+                start = newhtml.index('&cursor=')
+                end = newhtml.index("&unit_count=", start+1)
+                lending = newhtml[start:end]
+                #again, cleaning unicode surrogates
+                lending = lending.encode("utf-8").decode('unicode-escape')
+                lending = re.sub(r'[\uD800-\uDFFF]',"",lending)
+                lending = urllib.parse.unquote(lending)
+                lending = lending + '&unit_count='+str(maxresults)+'&dpr=1&__user=0&__a=1'
+                #https://it-it.facebook.com/pages_reaction_units/more/?page_id=286796408016028&cursor={"timeline_cursor":"timeline_unit:1:00000000001528624041:04611686018427387904:09223372036854775793:04611686018427387904","timeline_section_cursor":{},"has_next_page":true}&surface=www_pages_home&unit_count=8&dpr=1&__user=0&__a=1
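+                # every response carries the cursor for the next batch: from the second request on,
+                # the link is rebuilt from this cursor and only unit_count=8 posts are asked for per call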
+            except:
+                active = False
+        except:
+            postsarray = []
+            timearray = []
+        #saving results in a file, creating a new one or appending to an existing one if needed
+        if fname != "":
+            postsfile = ""
+            for i in range(len(postsarray)):
+                if postsarray[i] != "":
+                    if ascsv:
+                        postsfile = postsfile + timearray[i] + "\t"
+                    postsfile = postsfile + postsarray[i] + "\n"
+            if timelineiter == 0 and ripristino==False:
+                text_file = open(fname, "w", encoding='utf-8')
+                text_file.write(postsfile)
+                text_file.close()
+            else:
+                with open(fname, "a", encoding='utf-8') as myfile:
+                    myfile.write(postsfile)
+        timelineiter = timelineiter + 1
+        time.sleep(towait)
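+        # end of one batch: after the towait-second pause above, the loop repeats until no next-page cursor is found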
+
+
+if __name__ == '__main__':
+    START_PAGE = "https://it-it.facebook.com/chiesapastafarianaitaliana/"
+    if len(sys.argv)>1:
+        START_PAGE = sys.argv[1]
+    if "facebook.com" in START_PAGE:
+        output = "./"
+        if len(sys.argv)>2:
+            output = sys.argv[2]
+        ascsv = False
+        if len(sys.argv)>3:
+            if sys.argv[3] == "CSV":
+                ascsv = True
+        scrapefacebook(START_PAGE, output, ascsv)
+