1
+ import urllib .request , urllib .parse , urllib .error ,urllib .request ,urllib .error ,urllib .parse ,json ,re ,datetime ,sys ,http .cookiejar
2
+ from .. import models
3
+ from pyquery import PyQuery
4
+
5
class TweetManager:
    """Page through Twitter's search-timeline endpoint and build Tweet models.

    All functionality is exposed through static methods; the class holds no
    per-instance state.
    """

    # Compiled once: raw strings so ``\w`` reaches the regex engine intact.
    _MENTION_RE = re.compile(r'(@\w*)')
    _HASHTAG_RE = re.compile(r'(#\w*)')

    def __init__(self):
        pass

    @staticmethod
    def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100):
        """Collect tweets matching ``tweetCriteria``.

        Pages are requested through ``getJsonReponse`` until a page comes back
        with empty ``items_html``, a page contains no tweet elements, or
        ``tweetCriteria.maxTweets`` (when > 0) has been reached.

        Args:
            tweetCriteria: Object carrying the search options. ``maxTweets``
                is read directly; ``username``, ``since``, ``until``,
                ``querySearch`` and ``lang`` are optional attributes.
            receiveBuffer: Optional callable invoked with a list of tweets
                each time ``bufferLength`` tweets have accumulated, and once
                more at the end with any remainder.
            bufferLength: Number of tweets per ``receiveBuffer`` call.

        Returns:
            List of every tweet object collected.
        """
        refreshCursor = ''
        results = []
        resultsAux = []
        cookieJar = http.cookiejar.CookieJar()

        active = True
        while active:
            page = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar)
            if not page['items_html'].strip():
                break

            # The cursor returned by this page is what requests the next one.
            refreshCursor = page['min_position']
            tweets = PyQuery(page['items_html'])('div.js-stream-tweet')
            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweet = TweetManager._parseTweet(tweetHTML)
                results.append(tweet)
                resultsAux.append(tweet)

                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets:
                    active = False
                    break

        # Hand over whatever is left in a partially filled buffer.
        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results

    @staticmethod
    def getJsonReponse(tweetCriteria, refreshCursor, cookieJar):
        """Fetch one page of search results and return the decoded JSON.

        Args:
            tweetCriteria: Object carrying the optional search attributes
                (see ``_buildQuery``) and optional ``lang``.
            refreshCursor: Value placed in the ``max_position`` URL parameter.
            cookieJar: Cookie jar shared across requests.

        Returns:
            The parsed JSON document.

        Raises:
            SystemExit: With status 1 if the HTTP request fails; a
                diagnostic is printed first.
        """
        url = TweetManager._buildSearchUrl(tweetCriteria, refreshCursor)

        headers = [
            ('Host', "twitter.com"),
            ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"),
            ('Accept', "application/json, text/javascript, */*; q=0.01"),
            ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"),
            ('X-Requested-With', "XMLHttpRequest"),
            ('Referer', url),
            ('Connection', "keep-alive"),
        ]

        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookieJar))
        opener.addheaders = headers

        try:
            # The context manager closes the connection once the body is read.
            with opener.open(url) as response:
                jsonResponse = response.read()
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C still propagates.
            print("Twitter weird response. Try to see on browser: https://twitter.com/search?q=%s&src=typd" % urllib.parse.quote(TweetManager._buildQuery(tweetCriteria)))
            print("Unexpected error:", type(err))
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)

        return json.loads(jsonResponse.decode())

    @staticmethod
    def _buildQuery(tweetCriteria):
        """Return the unquoted search query assembled from the optional criteria."""
        query = ''
        if hasattr(tweetCriteria, 'username'):
            query += ' from:' + tweetCriteria.username
        if hasattr(tweetCriteria, 'since'):
            query += ' since:' + tweetCriteria.since
        if hasattr(tweetCriteria, 'until'):
            query += ' until:' + tweetCriteria.until
        if hasattr(tweetCriteria, 'querySearch'):
            query += ' ' + tweetCriteria.querySearch
        return query

    @staticmethod
    def _buildSearchUrl(tweetCriteria, refreshCursor):
        """Return the timeline URL for the given criteria and paging cursor."""
        template = "https://twitter.com/i/search/timeline?f=realtime&q=%s&src=typd&%smax_position=%s"
        if hasattr(tweetCriteria, 'lang'):
            urlLang = 'lang=' + tweetCriteria.lang + '&'
        else:
            urlLang = ''
        quotedQuery = urllib.parse.quote(TweetManager._buildQuery(tweetCriteria))
        return template % (quotedQuery, urlLang, refreshCursor)

    @staticmethod
    def _extractMentions(text):
        """Return the @mentions found in ``text``, space separated."""
        return " ".join(TweetManager._MENTION_RE.findall(text))

    @staticmethod
    def _extractHashtags(text):
        """Return the #hashtags found in ``text``, space separated."""
        return " ".join(TweetManager._HASHTAG_RE.findall(text))

    @staticmethod
    def _formatUtcDate(dateSec):
        """Format a Unix timestamp as a UTC string carrying a ``+0000`` offset."""
        # Convert in UTC so the literal "+0000" in the format is truthful
        # regardless of the host's local timezone.
        moment = datetime.datetime.fromtimestamp(dateSec, tz=datetime.timezone.utc)
        return moment.strftime("%a %b %d %X +0000 %Y")

    @staticmethod
    def _parseTweet(tweetHTML):
        """Build a ``models.Tweet`` from one tweet element of the result page."""
        tweetPQ = PyQuery(tweetHTML)
        tweet = models.Tweet()

        usernameTweet = tweetPQ("span.username.js-action-profile-name b").text()
        rawText = tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@')
        txt = re.sub(r"\s+", " ", rawText)
        retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
        favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
        dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
        tweetId = tweetPQ.attr("data-tweet-id")
        permalink = tweetPQ.attr("data-permalink-path")
        user_id = int(tweetPQ("a.js-user-profile-link").attr("data-user-id"))

        geo = ''
        geoSpan = tweetPQ('span.Tweet-geo')
        if len(geoSpan) > 0:
            geo = geoSpan.attr('title')

        # Only anchors that carry an expanded URL contribute to the list.
        urls = [
            link.attrib["data-expanded-url"]
            for link in tweetPQ("a")
            if "data-expanded-url" in link.attrib
        ]

        tweet.id = tweetId
        tweet.permalink = 'https://twitter.com' + permalink
        tweet.username = usernameTweet

        tweet.text = txt
        # NOTE: kept as a naive local-time datetime for backward compatibility.
        tweet.date = datetime.datetime.fromtimestamp(dateSec)
        tweet.formatted_date = TweetManager._formatUtcDate(dateSec)
        tweet.retweets = retweets
        tweet.favorites = favorites
        tweet.mentions = TweetManager._extractMentions(tweet.text)
        tweet.hashtags = TweetManager._extractHashtags(tweet.text)
        tweet.geo = geo
        tweet.urls = ",".join(urls)
        tweet.author_id = user_id

        return tweet
0 commit comments