author     Luca Tringali <tringalinvent@libero.it>    2018-09-03 23:00:54 +0200
committer  Luca Tringali <tringalinvent@libero.it>    2018-09-03 23:00:54 +0200
commit     6b76d666dd1cfb851639453140533a155080d1df (patch)
tree       3348e7f3f0af43c24f183d6a2ab90b74223baee4
First commit
-rw-r--r--    scrapefb.py    194
1 file changed, 194 insertions, 0 deletions
diff --git a/scrapefb.py b/scrapefb.py
new file mode 100644
index 0000000..94c8c72
--- /dev/null
+++ b/scrapefb.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+#Written by Luca Tringali
+# The code is described here:
+# https://www.codice-sorgente.it/2018/08/facebook-scraping-scaricare-tutti-i-post-delle-pagine-facebook/
+#Released under GNU GPL3
+
+#USAGE: python3 ./scrapefb.py https://facebookpageurl/ ./ CSV
+#The first argument is the Facebook page URL, the second is the output folder,
+#and the third (in caps) selects the output format: TXT or CSV
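+#Example: python3 ./scrapefb.py https://it-it.facebook.com/chiesapastafarianaitaliana/ ./ CSV
+#writes the scraped posts to ./fb_chiesapastafarianaitaliana.csv, one post per line with a
+#leading timestamp, plus a ./fb_chiesapastafarianaitaliana.tmp file used to resume interrupted runs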
+
+import urllib.request
+import urllib.parse
+import re
+import html
+import sys
+import os
+import json
+import datetime
+import time
+from socket import timeout
+
+useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
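+# this desktop browser User-Agent is sent with every request made by geturl() below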
+
+def geturl(thisurl):
+    global useragent
+    if thisurl == '':
+        return ''
+    req = urllib.request.Request(
+        thisurl,
+        data=None,
+        headers={
+            'User-Agent': useragent
+        }
+    )
+
+    thishtml = ""
+    try:
+        f = urllib.request.urlopen(req, timeout=300)
+        ft = f.read() #we should stop if this is taking too long
+    except:
+        ft = ""
+    try:
+        encoding = f.info().get_content_charset()
+        if encoding is None:
+            encoding = 'windows-1252'
+        thishtml = ft.decode(encoding)
+    except:
+        try:
+            thishtml = ft.decode('utf-8', 'backslashreplace')
+        except:
+            thishtml = str(ft)
+    try:
+        thishtml = html.unescape(thishtml)
+    except:
+        thishtml = ""
+    return thishtml
+
+
+def scrapefacebook(mypage, output="./", ascsv=False):
+    TOSELECT_FB = 'pages_reaction_units'
+    startposts = "{\"__html\":"
+    endposts = "]]"
+    maxresults = 300
+    towait = 10
+    lstart = '/pages_reaction_units/more/?page_id='
+    lending = '&cursor={"card_id":"videos","has_next_page":true}&surface=www_pages_home&unit_count='+str(maxresults)+'&referrer&dpr=1&__user=0&__a=1'
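+    # posts are fetched from the /pages_reaction_units/more/ AJAX endpoint; the response embeds
+    # the rendered timeline HTML in a JSON field ({"__html": ...), which is what startposts and
+    # endposts delimit below; the first request uses the generic "videos" cursor from the page source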
+    allhtml = geturl(mypage)
+    try:
+        start = mypage.index("https://")
+        end = mypage.index('/',start+8)
+        fbdomain = mypage[start:end]
+        indexes = [(m.start(0), m.end(0)) for m in re.finditer(TOSELECT_FB, allhtml[start+1:])]
+        start = indexes[0][0]
+        end = allhtml.index('"',start+1)
+        thislink = allhtml[start:end]
+        #getting the page ID from the page source HTML
+        #https://it-it.facebook.com/pages_reaction_units/more/?page_id=286796408016028&cursor={"card_id":"videos","has_next_page":true}&surface=www_pages_home&unit_count=300&referrer&dpr=1&__user=0&__a=1
+        start = thislink.index("page_id=")
+        end = thislink.index('&',start+9)
+        pageid = thislink[start+8:end]
+        start = mypage.index("facebook.com")
+        pagename = mypage[start+12:]
+        pagename = re.sub(r'[^A-Za-z0-9]',"",pagename)
+    except:
+        fbdomain = ""
+        pageid = ""
+        pagename = ""
+    fname = output + "fb_" + pagename + ".txt"
+    if ascsv:
+        fname = output + "fb_" + pagename + ".csv"
+    alllinks = []
+    linksfile = output + "fb_" + pagename + ".tmp"
+    if os.path.isfile(linksfile):
+        alllinks = [line.rstrip('\n') for line in open(linksfile, encoding='utf-8')]
+    timelineiter = 0
+    ripristino = False
+    active = True
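+    # fb_<pagename>.tmp records every cursor URL already requested: if it exists, the first
+    # iteration resumes from the last saved URL instead of starting over; each pass of the
+    # loop fetches one batch of posts, appends it to the output file and sleeps towait seconds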
+    while active:
+        link = fbdomain + lstart + pageid + lending
+        if timelineiter == 0 and len(alllinks)>0:
+            link = alllinks[len(alllinks)-1]
+            ripristino = True
+        with open(linksfile, "a", encoding='utf-8') as lfile:
+            lfile.write(link+'\n')
+        print(link)
+        newhtml = geturl(link)
+        try:
+            start = newhtml.index(startposts)
+            end = newhtml.index(endposts)
+            postshtml = newhtml[start:end]
+            #deleting unicode surrogates
+            postshtml = postshtml.encode("utf-8").decode('unicode-escape')
+            postshtml = re.sub(r'[\uD800-\uDFFF]',"",postshtml)
+            #splitting posts by their Unix time (data-utime) and keeping only text inside <p> </p> tags
+            postsarray = re.split('data-utime', postshtml)
+            timearray = []
+            for i in range(len(postsarray)):
+                try:
+                    start = postsarray[i].index('"')
+                    end = postsarray[i].index('"',start+2)
+                    utime = postsarray[i][start:end]
+                    utime = re.sub(r'[^0-9]',"",utime)
+                    utimei = int(utime)
+                except:
+                    utimei = 0
+                thistime = datetime.datetime.utcfromtimestamp(utimei).strftime('%Y-%m-%d %H:%M:%S')
+                timearray.append(thistime)
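+                # e.g. a data-utime value of 1528624041 becomes "2018-06-10 09:47:21" (UTC)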
+                indexes = [(m.start(0), m.end(0)) for m in re.finditer('<p>(.*?)<\\\\/p>', postsarray[i])]
+                thispost = ""
+                for n in range(len(indexes)):
+                    start = indexes[n][0]
+                    end = indexes[n][1]
+                    thispost = thispost + postsarray[i][start:end]
+                #cleaning unnecessary tags
+                postsarray[i] = re.sub(r'<.*?>',"",thispost)
+                #cleaning unnecessary slashes
+                postsarray[i] = re.sub('\\\\/',"/",postsarray[i])
+            print(postsarray)
+            try:
+                maxresults = 8
+                start = newhtml.index('&cursor=')
+                end = newhtml.index("&unit_count=", start+1)
+                lending = newhtml[start:end]
+                #again, cleaning unicode surrogates
+                lending = lending.encode("utf-8").decode('unicode-escape')
+                lending = re.sub(r'[\uD800-\uDFFF]',"",lending)
+                lending = urllib.parse.unquote(lending)
+                lending = lending + '&unit_count='+str(maxresults)+'&dpr=1&__user=0&__a=1'
+                #https://it-it.facebook.com/pages_reaction_units/more/?page_id=286796408016028&cursor={"timeline_cursor":"timeline_unit:1:00000000001528624041:04611686018427387904:09223372036854775793:04611686018427387904","timeline_section_cursor":{},"has_next_page":true}&surface=www_pages_home&unit_count=8&dpr=1&__user=0&__a=1
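+                # every response carries the cursor for the next batch: from the second request on,
+                # the link is rebuilt from this cursor and only unit_count=8 posts are asked for per call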
+            except:
+                active = False
+        except:
+            postsarray = []
+            timearray = []
+        #saving results in a file, creating a new one or appending to an existing one if needed
+        if fname != "":
+            postsfile = ""
+            for i in range(len(postsarray)):
+                if postsarray[i] != "":
+                    if ascsv:
+                        postsfile = postsfile + timearray[i] + "\t"
+                    postsfile = postsfile + postsarray[i] + "\n"
+            if timelineiter == 0 and ripristino==False:
+                text_file = open(fname, "w", encoding='utf-8')
+                text_file.write(postsfile)
+                text_file.close()
+            else:
+                with open(fname, "a", encoding='utf-8') as myfile:
+                    myfile.write(postsfile)
+        timelineiter = timelineiter + 1
+        time.sleep(towait)
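+        # end of one batch: after the towait-second pause above, the loop repeats until no next-page cursor is found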
+
+
+if __name__ == '__main__':
+    START_PAGE = "https://it-it.facebook.com/chiesapastafarianaitaliana/"
+    if len(sys.argv)>1:
+        START_PAGE = sys.argv[1]
+    if "facebook.com" in START_PAGE:
+        output = "./"
+        if len(sys.argv)>2:
+            output = sys.argv[2]
+        ascsv = False
+        if len(sys.argv)>3:
+            if sys.argv[3] == "CSV":
+                ascsv = True
+        scrapefacebook(START_PAGE, output, ascsv)
+