import urllib as ul
import urllib2 as ul2
import re
import codecs
import time
import mwclient
import twitter
from logindetails import username as identicausername
from logindtails import password as identicapassword
# Host name handed to mwclient.Site() in get_entries_from_wp().
enwp = "en.wikipedia.org"
# Title of the wiki page whose revisions are scanned for hooks.
dyk = "Template:Did you know"
# Files persisting the wikitext and the id of the last processed revision
# (read by get_last_rev_info(), written by update_last_rev_info()).
lastrevcontentsfile = "/home/briannalaugher/dyk2identica.modernthings.org/code/lastrevcontents.txt"
lastrevidfile = "/home/briannalaugher/dyk2identica.modernthings.org/code/lastrevid.txt"
# One hook: captures the text after "{{*mp}}..." up to and including the "?"
# that is followed by an optional space and a newline.
redyk = re.compile("\{\{\*mp\}\}\.\.\.(.*?\?) ?\n")
# Captures the text enclosed by a pair of ''' markers.
renewpage = re.compile("'''(.*?)'''")
# Piped wiki link [[target|label]]; groups: whole link, target, label.
repipelink = re.compile("(\[\[(.*?)\|(.*?)\]\])")
# Piped wiki link whose target part contains no "["; groups: whole link, label.
repipelink2 = re.compile("(\[\[[^[]*?\|(.*?)\]\])")
# Parenthesised aside containing the word "pictured", together with up to two
# quote marks on either side and one optional trailing space.
repictured = re.compile("('?'?\((?:.*?)pictured(?:.*?)\)'?'? ?)")
# Wiki link [[...]]; captures everything between the brackets.
relink = re.compile("\[\[(.*?)\]\]")
# Number of seconds passed to time.sleep() in post_to_identica() (3.5 minutes).
updatedelay = 3.5*60
def get_last_rev_info():
    """Load the saved state describing the last processed revision.

    Reads ``lastrevcontentsfile`` (decoded as UTF-8) and ``lastrevidfile``.

    Returns:
        A ``(revision_id, contents)`` tuple: the id as an ``int`` and the
        revision text as decoded by ``codecs``.

    Raises:
        IOError: if either file cannot be opened or read.
        ValueError: if the id file does not contain an integer.
    """
    # ``with`` closes each handle even when read() or int() raises, which the
    # previous explicit close() calls did not guarantee.
    with codecs.open(lastrevcontentsfile, 'r', 'utf-8') as contents_file:
        contents = contents_file.read()
    with open(lastrevidfile, 'r') as id_file:
        rev_id = int(id_file.read().strip())
    return (rev_id, contents)
def update_last_rev_info(newid, newcontents):
    """Persist the id and text of the most recently processed revision.

    Args:
        newid: revision id; written to ``lastrevidfile`` via ``str()``.
        newcontents: revision text; written to ``lastrevcontentsfile``
            encoded as UTF-8.

    Returns:
        None.

    Raises:
        IOError: if either file cannot be opened or written.
    """
    # ``with`` flushes and closes each handle even if write() raises.
    with codecs.open(lastrevcontentsfile, 'w', 'utf-8') as contents_file:
        contents_file.write(newcontents)
    with open(lastrevidfile, 'w') as id_file:
        id_file.write(str(newid))
return
def get_entries_from_wp(lastrevid):
    """Collect hooks from the revisions of the ``dyk`` page after *lastrevid*.

    Walks the page revisions between its current revision and *lastrevid*,
    gathering every ``redyk`` match, and stops as soon as *lastrevid* itself
    is reached (that revision is not scanned).  When the current revision's
    text was seen, it is saved through update_last_rev_info().

    Args:
        lastrevid: id of the revision that was processed on the previous run.

    Returns:
        List of hook strings in wikitext; may contain duplicates because the
        same hook can appear in several revisions.
    """
    site = mwclient.Site(enwp)
    page = site.Pages[dyk]
    newestid = page.revision
    # Stays None when the revision listing never yields ``newestid`` (for
    # example an empty listing); previously this left the name unbound and
    # the save below failed with UnboundLocalError.
    newestcontent = None
    dyks = []
    # NOTE(review): presumably revisions are yielded newest first -- confirm
    # against the installed mwclient version.
    for r in page.revisions(startid=newestid, endid=lastrevid, prop='ids|content'):
        if r['revid'] == newestid:
            newestcontent = r['*']
        if r['revid'] == lastrevid:
            break
        dyks += redyk.findall(r['*'])
    if newestcontent is not None:
        update_last_rev_info(newestid, newestcontent)
    return dyks
def fix_links(text):
    """Strip wiki-link and nowiki markup from a fragment of hook text.

    Piped links matched by ``repipelink2`` are replaced by their label, the
    remaining ``[[`` / ``]]`` brackets and ``<nowiki>`` tags are removed, and
    the ``&nbsp;`` entity becomes a regular space.

    Args:
        text: wikitext fragment.

    Returns:
        The fragment with the markup above removed.
    """
    # Substitute in place rather than split-and-filter: filtering on
    # '"|" and "[[" present' also threw away ordinary text around the link
    # whenever that text happened to contain both markers.
    newtext = repipelink2.sub(lambda match: match.group(2), text)
    for markup in ("[[", "]]", "<nowiki>", "</nowiki>"):
        newtext = newtext.replace(markup, "")
    # The previous replace(" ", " ") was a no-op; the entity is what needs
    # turning into a space.
    newtext = newtext.replace("&nbsp;", " ")
    return newtext
class NoNewPageLinkError(Exception):
    """Raised when a bolded segment holds neither a template nor a wiki link."""


def handle_bolded_link(text):
    """Resolve a bolded hook segment to display text and a target page.

    Three forms are recognised, in this order:

    * ``{{a|b}}`` spanning the whole segment: the braces are dropped and
      ``|`` becomes a space; that string is both the text and the target.
    * a piped link ``[[target|label]]``: the link is replaced by its label.
    * a plain link ``[[target]]``: the link is replaced by the target.

    Args:
        text: the wikitext found between a pair of ``'''`` markers.

    Returns:
        ``(newtext, targetpage)``: the segment with the link markup replaced,
        and the title the link points to.

    Raises:
        NoNewPageLinkError: if none of the forms above is present.  The
            offending segment is printed first and attached to the exception.
    """
    if text.startswith("{{") and text.endswith("}}"):
        newtext = text[2:-2].replace("|", " ")
        return (newtext, newtext)
    m = repipelink.search(text)
    if m:
        # Group 1 is the whole link; 2 and 3 are target and label.
        targetpage = m.group(2)
        linktext = m.group(3)
    else:
        m = relink.search(text)
        if not m:
            print(text)
            raise NoNewPageLinkError(text)
        targetpage = linktext = m.group(1)
    (start, end) = m.span()
    newtext = text[:start] + linktext + text[end:]
    return (newtext, targetpage)
class SomethingWeirdPicturedAsideError(Exception):
    """Raised when a sentence contains more than one "pictured" aside."""


def remove_pictured_aside(text):
    """Remove the single "(... pictured ...)" aside from a sentence.

    Args:
        text: sentence text.

    Returns:
        *text* unchanged when ``repictured`` does not match, otherwise the
        text with the one matched aside cut out.

    Raises:
        SomethingWeirdPicturedAsideError: if ``repictured`` matches more than
            once.  The sentence is printed first and attached to the
            exception.
    """
    # Splitting on a pattern with one capture group yields
    # [before, aside, after] for a single match and one piece for no match.
    picturebits = repictured.split(text)
    if len(picturebits) == 1:
        return picturebits[0]
    if len(picturebits) == 3:
        return picturebits[0] + picturebits[2]
    # Report the input: the result variable was never assigned on this path,
    # so printing it used to fail before the intended error was raised.
    print(text)
    raise SomethingWeirdPicturedAsideError(text)
def get_link(targetpage):
    """Return the short enwp.org URL for *targetpage*, preceded by a space.

    Every space in the title is turned into an underscore; nothing else is
    escaped.
    """
    slug = "_".join(targetpage.split(" "))
    return " http://enwp.org/" + slug
class TooManyOrFewBoldedLinksError(Exception):
    """Raised when a hook contains no bolded (''' ... ''') segment."""


def mw_to_plaintext(dyks):
    """Convert wikitext hooks into plain-text status updates.

    Each hook is split on its bolded segments.  Plain segments are cleaned
    with fix_links(); bolded segments go through handle_bolded_link(), which
    also yields the page each one links to.  The assembled sentence, prefixed
    with "DYK", then has its "pictured" aside and remaining quote markup
    removed, and a short link for every bolded target is appended.

    A hook is skipped when one of its bolded segments cannot be resolved.

    Args:
        dyks: iterable of hook strings in wikitext.

    Returns:
        dict mapping the lower-cased title of the first bolded target to the
        finished update text.

    Raises:
        TooManyOrFewBoldedLinksError: if a hook has no bolded segment.  The
            hook is printed first and attached to the exception.

    Exceptions raised by remove_pictured_aside() are not caught here.
    """
    plaintexts = {}
    for d in dyks:
        # With one capture group, split() alternates plain text (even
        # indexes) and bolded text (odd indexes).
        sentparts = renewpage.split(d)
        if len(sentparts) < 3:
            print(d)
            raise TooManyOrFewBoldedLinksError(d)
        pieces = ["DYK"]
        targets = []
        resolved = True
        # Walk every segment so that the text of the second (and any later)
        # bolded link is kept; it used to be computed and then discarded, and
        # anything after a third bolded link was cut off.
        for index, part in enumerate(sentparts):
            if index % 2 == 0:
                pieces.append(fix_links(part))
                continue
            try:
                (boldtext, targetpage) = handle_bolded_link(part)
            except Exception:
                # Best effort: a hook with an unusable bolded segment is
                # dropped rather than aborting the whole batch.
                resolved = False
                break
            pieces.append(boldtext)
            targets.append(targetpage)
        if not resolved:
            continue
        newsentence = remove_pictured_aside("".join(pieces))
        newsentence = newsentence.replace("'''", "").replace("''", "")
        newsentence += "".join(get_link(target) for target in targets)
        plaintexts[targets[0].lower()] = newsentence
    return plaintexts
def wp_to_identica(lastrevcontents, dyks):
    """Select the updates that were not already present in the last revision.

    Hooks are de-duplicated, those appearing verbatim in *lastrevcontents*
    are dropped, and of the remainder only the ones whose mw_to_plaintext()
    key is absent from the previous revision's keys are returned.

    Args:
        lastrevcontents: wikitext of the previously processed revision.
        dyks: hook strings gathered from newer revisions.

    Returns:
        List of plain-text updates.
    """
    previous_hooks = redyk.findall(lastrevcontents)
    fresh_hooks = [hook for hook in list(set(dyks)) if hook not in previous_hooks]
    # The previous revision is converted first, as before.
    previous_updates = mw_to_plaintext(previous_hooks)
    candidate_updates = mw_to_plaintext(fresh_hooks)
    return [
        candidate_updates[key]
        for key in candidate_updates.keys()
        if key not in previous_updates.keys()
    ]
def post_to_identica(updates):
    """Post every update through the twitter API client, pausing in between.

    Each update is reduced to ASCII (other characters are dropped).  Updates
    shorter than 160 characters are posted as they are; longer ones are cut
    at the first space after the midpoint and posted as two messages joined
    by "...".  After each update the function sleeps for ``updatedelay``
    seconds.

    Args:
        updates: iterable of update strings.

    Raises:
        Whatever ``api.PostUpdate`` raises; for short updates the failure is
        reported on stdout before being re-raised.
    """
    api = twitter.Api(username=identicausername, password=identicapassword)
    for update in updates:
        update = update.encode('ascii', 'ignore')
        length = len(update)
        if length >= 160:
            midpoint = length // 2
            update1 = update[:midpoint]
            (head, space, tail) = update[midpoint:].partition(" ")
            if space:
                # Finish the word straddling the midpoint in the first part.
                update1 += head
                update2 = tail
            else:
                update2 = head
            update1 += "..."
            update2 = "..." + update2
            api.PostUpdate(update1)
            api.PostUpdate(update2)
        else:
            try:
                api.PostUpdate(update)
            except:
                print("update failed")
                print(update)
                raise
        time.sleep(updatedelay)
    return
def main():
    """Run one pass: load saved state, fetch new hooks, post what is new."""
    lastrevid, lastrevcontents = get_last_rev_info()
    new_hooks = get_entries_from_wp(lastrevid)
    post_to_identica(wp_to_identica(lastrevcontents, new_hooks))


# Only run a pass when the file is executed as a script.
if __name__ == "__main__":
    main()