I.T. Spices The LINUX Way
Python In The Shell: The STEEMIT Ecosystem – Post #110
SCRAPING ALL BLOGS USING PYTHON - INTRO
Have you ever wondered how to gather all the details of an account from all of its blog posts?
In this first post of the series, I will present a simple and effective way of doing so, using a combination of web scraping and the steem Python module.
HOW THIS SCRIPT DOES ITS JOB
This Python script takes two arguments: the first is the Steemit account, and the second is the number of posts we want to gather. So if we want to get all posts, a big number (like 1000) can be tried.
The script starts from the given post number and decrements it by 10 in every cycle until all posts are gathered, laying out all the important data for each post, namely:
- the full URL of the blog post
- the title of the blog post
- the date as to when the blog was posted
- the rewards given to the blog post
- the body of the blog post, with video and image links (if applicable)
- the comments on the blog post
- the detailed entry of the upvoters of the blog post
- the tags of the blog post
We can also gather only the last 20 posts for example. For this, we just need to input the number 20 as the second argument.
The whole python script is laid down below for this intro post, with line numbers:
#!/usr/bin/python3.6
"""Dump the blog posts of a Steemit account to the console and a log file.

Usage: <script> ACCOUNT START

ACCOUNT is the Steemit account name. START is the highest blog entry index
to fetch; it must be at least 10 and divisible by 10. Entries are fetched in
batches of ten, walking from START down to entry 0.

For every main post authored by ACCOUNT the script prints (and appends to
``/dev/shm/steemblogs/templogs``) the URL, title, date, rewards, body
(video links, image links, paragraphs), comments, voters and tags. Each post
URL is also appended to ``weburls`` and the prettified HTML to ``samplesoup``
in the same directory. The directory is wiped at the start of every run.
"""

###MODULES
import sys
import os
import shutil
import requests
import re
from bs4 import BeautifulSoup
from steem import Steem
from steem.post import Post

###SETTINGS
TEMPDIR = '/dev/shm/steemblogs'
API_NODE = 'https://api.steemit.com'
BASEURL = 'https://steemit.com/'
HEADERS = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
# Seconds to wait for steemit.com before giving up instead of hanging forever.
HTTP_TIMEOUT = 30

BORDER_START = '\n#############################################################################################'
BORDER_END = '\n###############################################################'
SOUP_BORDER = '\n##############################################################################################################################\n'


def _emit(log, text):
    """Print *text* and append the same line to the open log file *log*.

    Console headings start with a newline so they are preceded by a blank
    line; in the log every entry simply starts on a new line, so a newline
    is only prepended when *text* does not already begin with one.
    """
    print(text)
    log.write(text if text.startswith('\n') else '\n' + text)


def _report_body(log, soup):
    """Report video links, image links and paragraph text found in *soup*."""
    _emit(log, '\nBODY:')
    paragraphs = soup.findAll('p')
    videos = soup.findAll('div', {'class': 'videoWrapper youtube'})
    images = soup.findAll('img')

    #FOR YOUTUBE VIDEOS
    if videos:
        _emit(log, '\n' + ' VIDEO link/s:')
        for video in videos:
            # The wrapper markup carries a parenthesised thumbnail URL; the
            # fifth '/'-separated field of that span is used as the video id.
            found = re.search(r'\(.+?\)', str(video))
            if found is None:
                continue
            fields = found.group(0).split('/')
            video_id = fields[4] if len(fields) > 4 else ''
            _emit(log, ' https://www.youtube.com/watch?v=' + video_id)

    #FOR IMAGES
    if images:
        _emit(log, '\n' + ' IMAGE link/s:')
        for image in images:
            # Read the src attribute directly; the first quoted attribute of
            # the tag is not guaranteed to be the image URL.
            src = image.get('src')
            if src:
                _emit(log, ' ' + str(src))

    #PARAGRAPHS TEXT
    if paragraphs:
        _emit(log, '\n' + ' TEXT Paragraphs:')
        for paragraph in paragraphs:
            _emit(log, ' ' + paragraph.text.replace('\n', '\n '))


def _report_post(steem, log, account, post):
    """Scrape one post's web page and report all of its details.

    *steem* is the Steem client (used for the voters list), *log* the open
    log file, *account* the author name and *post* a ``steem.post.Post``.
    """
    permlink = post["permlink"]
    tags = list(post["tags"])
    # The first non-empty tag is the category segment of the web URL.
    category = next((t for t in tags if t != ""), "")

    #######################PRINT BORDER START OF EVERY RESULT
    _emit(log, BORDER_START)

    ###PRINT THE WEB URL COMPLETED
    if category:
        fullurl = BASEURL + category + '/@' + account + '/' + permlink
    else:
        # NOTE(review): no usable tag; presumably steemit.com resolves the
        # category-less form of the URL - confirm.
        fullurl = BASEURL + '@' + account + '/' + permlink
    _emit(log, '\nPOST URL:\n' + ' ' + fullurl)
    # Plain file append: the URL contains remote data and must never be
    # interpolated into a shell command line.
    with open(os.path.join(TEMPDIR, 'weburls'), 'a', encoding='utf-8') as urls:
        urls.write(fullurl + '\n')

    ###SCRAPE THE URL RIGHT AWAY
    response = requests.get(fullurl, headers=HEADERS, timeout=HTTP_TIMEOUT)
    try:
        html = response.content
    finally:
        response.close()
    soup = BeautifulSoup(html, "html.parser")

    #POST TITLE
    title_tag = soup.find('h1', {'class': 'entry-title'})
    _emit(log, '\nTITLE:')
    if title_tag is not None:
        _emit(log, ' ' + str(title_tag.text))
    else:
        _emit(log, ' No TITLE found.......')

    #DATE OF THE BLOG POST
    _emit(log, '\nPOST DATE:')
    _emit(log, ' ' + str(post["created"].date()))

    #REWARDS IN USD
    menus = soup.findAll('div', {'class': 'DropdownMenu'})
    try:
        rewards = menus[0].a.span.span.text.replace(' ', '')
    except (IndexError, AttributeError):
        # No dropdown on the page, or its markup is not nested as expected.
        rewards = 'No rewards parsed...'
    _emit(log, '\nREWARDS:')
    _emit(log, ' ' + rewards)

    #BODY
    _report_body(log, soup)
    with open(os.path.join(TEMPDIR, 'samplesoup'), 'a+', encoding='utf-8') as sample:
        sample.write(SOUP_BORDER)
        sample.write(soup.prettify())
        sample.write(SOUP_BORDER)

    #COMMENTS
    comment_divs = soup.findAll('div', {'class': 'Comment__body entry-content'})
    if comment_divs:
        _emit(log, '\nCOMMENTS:')
        for div in comment_divs:
            _emit(log, ' ' + div.text.strip().replace('\n', '\n '))

    ###GET THE UPVOTERS
    _emit(log, '\nVOTERS:')
    for vote in steem.get_active_votes(account, permlink):
        _emit(log, ' ' + str(vote))

    ###POST TAGS
    _emit(log, '\nTAGS:')
    for name in tags:
        _emit(log, ' ' + name)

    #######################PRINT BORDER END OF EVERY RESULT
    _emit(log, BORDER_END)


def main(argv):
    """Validate the command line, then walk the blog in batches of ten.

    Returns the process exit status: ``None`` on success, 2 when arguments
    are missing, 1 when START is not a multiple of 10 that is >= 10.
    """
    if len(argv) < 3:
        print('Usage: ' + argv[0] + ' ACCOUNT START')
        return 2
    account = argv[1]
    try:
        start = int(argv[2])
    except ValueError:
        start = -1  # falls through to the START error message below

    if start < 10 or start % 10 != 0:
        print()
        print('START variable needs to be equal to or greater than 10 and be divisible by 10. Please try again.......')
        print()
        return 1

    ###MAKE TEMP DIR (only after validation, so a bad call keeps old results)
    shutil.rmtree(TEMPDIR, ignore_errors=True)
    os.mkdir(TEMPDIR)

    ###GET BLOGS HERE
    steem = Steem(API_NODE)

    ###OPEN TEMP LOGS FILE FOR THE LOGS (closed automatically, even on errors)
    with open(os.path.join(TEMPDIR, 'templogs'), 'a+', encoding='utf-8') as flogs:
        counter = start
        while counter >= 0:
            ###INDICATE THE COUNTER
            _emit(flogs, '\n' + 'COUNTER is now ' + str(counter))

            # The last batch (start == 10) asks for 11 entries so that
            # entry 0 is included.
            limit = 11 if start == 10 else 10
            for entry in steem.get_blog(account, start, limit):
                post = Post(entry["comment"])
                # SHOULD BE MAIN POST AND NOT RESTEEMED POST FROM OTHER ACCOUNTS
                if post.is_main_post() and post["author"] == account:
                    _report_post(steem, flogs, account, post)
                # NOTE(review): the published listing lost its indentation;
                # the decrement is assumed to run once per fetched entry.
                counter -= 1

            ###MOVE THE WINDOW DOWN BY 10
            start -= 10
            if start <= 0:
                counter = -1
    return None


if __name__ == '__main__':
    sys.exit(main(sys.argv))
We will discuss each segment in the next posts for our digital forensics of the steemit blogs.
“A New Year, A New Hope……. Always.”