diff --git a/Issue.py b/Issue.py
new file mode 100644
index 0000000..f7bcb85
--- /dev/null
+++ b/Issue.py
@@ -0,0 +1,36 @@
+class IssueComment(object):
+
+
+    author = ""
+    title = ""
+    summary = ""
+    date = ""
+
+    def getAuthor(self):
+        return self.author
+
+    def getTitle(self):
+        return self.title
+
+    def getSummary(self):
+        return self.summary
+
+    def getDate(self):
+        return self.date
+
+
+
+class Issue(IssueComment):
+
+    status = ""
+    comments = []
+    id = -1
+
+    def getId(self):
+        return self.id
+
+    def getStatus(self):
+        return self.status
+
+    def getComments(self):
+        return self.comments
\ No newline at end of file
diff --git a/Project.py b/Project.py
new file mode 100644
index 0000000..c2ad7da
--- /dev/null
+++ b/Project.py
@@ -0,0 +1,26 @@
+from enum import enum
+
+REPO_TYPES = enum("SVN", "git", "hg", "NA")
+
+class Project(object):
+
+    repoURL = ""
+    releases = []
+    issues = []
+    wikis = []
+    repoType = REPO_TYPES.NA
+
+    def getRepoURL(self):
+        return self.repoURL
+
+    def getReleases(self):
+        return self.releases
+
+    def getIssues(self):
+        return self.issues
+
+    def getRepoType(self):
+        return self.repoType
+
+    def getWikis(self):
+        return self.wikis
\ No newline at end of file
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..492879a
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,52 @@
+# codescrape
+
+Version 1.0
+
+By: Nathan Adams
+
+License: MIT
+
+## Description
+
+This library archives project data. With the announcement that Google Code is going archive-only, I wanted a library that lets you grab source data before it is gone forever.
+
+Use cases include:
+
+Archiving projects due to:
+
+- Hosting service shutting down
+- Authorities sending a cease-and-desist against the provider/project
+- Historical, research, or educational purposes
+
+## Usage
+
+Currently srchub and Google Code are supported. To use:
+
+    from services.srchub import srchub
+    shub = srchub()
+    projects = shub.getProjects()
+
+or, for Google Code:
+
+    from services.googlecode import googlecode
+    gcode = googlecode()
+    project = gcode.getProject("android-python27")
+
+The srchub service pulls all public projects, since that list is easy to access. Google Code does not have a public list per se, and I didn't want to scrape the search results, so the Google Code service requires you to pass in the project name. If you get your hands on a list of Google Code projects, you can easily loop through them:
+
+    from services.googlecode import googlecode
+    gcode = googlecode()
+    for projectName in someProjectList:
+        project = gcode.getProject(projectName)
+        # do something with project
+
+The project data structure is as follows:
+
+project
+
+- getRepoURL() -> Returns the URL of the repo
+- getRepoType() -> Returns the type of repo (git, hg, or SVN)
+- getReleases() -> Returns all downloads related to the project
+- getIssues() -> Returns open issues
+- getWikis() -> Returns wikis
+
diff --git a/Release.py b/Release.py
new file mode 100644
index 0000000..ecdacc6
--- /dev/null
+++ b/Release.py
@@ -0,0 +1,18 @@
+class Release(object):
+
+    fileName = ""
+    summary = ""
+    fileURL = ""
+    checksum = None
+
+    def getFileName(self):
+        return self.fileName
+
+    def getSummary(self):
+        return self.summary
+
+    def getFileURL(self):
+        return self.fileURL
+
+    def getChecksum(self):
+        return self.checksum
\ No newline at end of file
diff --git a/Service.py b/Service.py
new file mode 100644
index 0000000..d079db8
--- /dev/null
+++ b/Service.py
@@ -0,0 +1,41 @@
+import pycurl
+try:
+    from cStringIO import StringIO
+except ImportError:
+    try:
+        from StringIO import StringIO
+    except ImportError:
+        from io import StringIO
+from urllib import urlencode
+
+class Service(object):
+
+    def getProjects(self):
+        pass
+
+    def curl_post(self, url, postvals, header = []):
+        buffer = StringIO()
+        cobj = pycurl.Curl()
+        cobj.setopt(pycurl.URL, url)
+        cobj.setopt(pycurl.SSL_VERIFYPEER, 0)
+        cobj.setopt(pycurl.SSL_VERIFYHOST, 0)
+        cobj.setopt(pycurl.POST, 1)
+        cobj.setopt(pycurl.WRITEDATA, buffer)
+        postdata = urlencode(postvals)
+        cobj.setopt(pycurl.POSTFIELDS, postdata)
+        cobj.setopt(pycurl.HTTPHEADER, header)
+        cobj.perform()
+        cobj.close()
+        return buffer
+
+    def curl_get(self, url, header = []):
+        buffer = StringIO()
+        cobj = pycurl.Curl()
+        cobj.setopt(pycurl.SSL_VERIFYPEER, 0)
+        cobj.setopt(pycurl.SSL_VERIFYHOST, 0)
+        cobj.setopt(pycurl.URL, url)
+        cobj.setopt(pycurl.WRITEDATA, buffer)
+        cobj.setopt(pycurl.HTTPHEADER, header)
+        cobj.perform()
+        cobj.close()
+        return buffer
\ No newline at end of file
diff --git a/Wiki.py b/Wiki.py
new file mode 100644
index 0000000..ef64928
--- /dev/null
+++ b/Wiki.py
@@ -0,0 +1,22 @@
+class Wiki(object):
+
+    pageName = ""
+    htmlContent = ""
+    textContent = ""
+    summary = ""
+    updated = ""
+
+    def getPageName(self):
+        return self.pageName
+
+    def getHTMLContent(self):
+        return self.htmlContent
+
+    def getTextContent(self):
+        return self.textContent
+
+    def getSummary(self):
+        return self.summary
+
+    def getUpdated(self):
+        return self.updated
\ No newline at end of file
diff --git a/enum.py b/enum.py
new file mode 100644
index 0000000..0eaf725
--- /dev/null
+++ b/enum.py
@@ -0,0 +1,7 @@
+# Pythonic way to do enums:
+# http://stackoverflow.com/a/1695250/195722
+def enum(*sequential, **named):
+    enums = dict(zip(sequential, range(len(sequential))), **named)
+    reverse = dict((value, key) for key, value in enums.iteritems())
+    enums['val'] = reverse
+    return type('Enum', (), enums)
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..9a51622
--- /dev/null
+++ b/main.py
@@ -0,0 +1,8 @@
+from services.srchub import srchub
+from services.googlecode import googlecode
+
+#shub = srchub()
+
+#projects = shub.getProjects()
+gcode = googlecode()
+project = gcode.getProject("android-python27")
diff --git a/services/__init__.py b/services/__init__.py
new file mode 100644
index 0000000..ee0f945
--- /dev/null
+++ b/services/__init__.py
@@ -0,0 +1 @@
+__author__ = 'nathan'
diff --git a/services/googlecode.py b/services/googlecode.py
new file mode 100644
index 0000000..6383e46
--- /dev/null
+++ b/services/googlecode.py
@@ -0,0 +1,113 @@
+from Service import Service
+from Project import REPO_TYPES, Project
+from Release import Release
+from Issue import IssueComment, Issue
+from Wiki import Wiki
+from bs4 import BeautifulSoup
+
+
+class googlecode(Service):
+
+    DOMAIN = "https://code.google.com"
+
+    # Since I want to stay on Google's good side
+    # I'm going to write this method to parse a single project
+    # You will need to provide your own project list to roll through
+    # Such a list exists (although incomplete)
+    # http://flossdata.syr.edu/data/gc/2012/2012-Nov/gcProjectInfo2012-Nov.txt.bz2
+    def getProject(self, projectName):
+        project = Project()
+        sourceType = None
+        projectURL = self.DOMAIN + "/p/" + projectName + "/"
+
+        projectpageHTML = self.curl_get(projectURL).getvalue()
+        projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")
+
+        sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
+        sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + "/p/" + sourceURL).getvalue(), "html.parser")
+        sourceSoupText = sourceSoup.get_text()
+
+        # get source
+        if "git clone" in sourceSoupText:
+            project.repoType = REPO_TYPES.git
+            project.repoURL = "https://code.google.com/p/" + projectName + "/"
+        elif "svn co" in sourceSoupText:
+            project.repoType = REPO_TYPES.SVN
+            project.repoURL = "http://" + projectName + ".googlecode.com/svn/"
+        else:
+            project.repoType = REPO_TYPES.hg
+            project.repoURL = "https://code.google.com/p/" + projectName + "/"
+
+
+        # get downloads
+        project.releases = []
+        downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/list").getvalue(), "html.parser")
+        downloadSection = downloadsSoup.find("table", "results")
+        if "Your search did not generate any results." not in downloadsSoup.get_text():
+            downloadRows = downloadSection.find_all("tr")[1:]
+            for downloadRow in downloadRows:
+                cols = downloadRow.find_all("td")
+                downloadTD = cols[1]
+                downloadURL = "https://" + projectName + ".googlecode.com/files/" + downloadTD.a.text.replace("\n", "").strip(" ")
+                fileName = downloadTD.a.text.replace("\n", "").strip(" ")
+                release = Release()
+                release.fileURL = downloadURL
+                release.fileName = fileName
+                project.releases.append(release)
+
+        # get issues
+        project.issues = []
+        issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/list").getvalue(), "html.parser")
+        if "Your search did not generate any results." not in issuesSoup.get_text():
+            issuesSection = issuesSoup.find("table", "results")
+            for issueRow in issuesSection.find_all("tr")[1:]:
+                issue = Issue()
+                cols = issueRow.find_all("td")
+                issueId = cols[1].text.replace("\n", "").strip()
+                issueURL = projectURL + "issues/detail?id=" + issueId
+                issueStatus = cols[3].text.replace("\n", "").strip(" ")
+                issueSummary = cols[8].text.replace("\n", "")
+                issueTitle = cols[8].text.replace("\n", "")
+                issueAuthor = cols[5].text.replace("\n", "")
+
+                #issue.author = issueAuthor
+                issue.comments = []
+                issue.status = issueStatus.strip(" ")
+                issue.summary = issueSummary.strip(" ")
+                issue.title = issueTitle
+                issue.id = issueId
+
+                # we must go deeper to get comments
+                issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
+                for comment in issueComments.find_all("div", "vt"):
+                    #author = comment.find(class_="author").find("a").text
+                    author = (comment.find(class_="author").find_all("a")[-1]).contents
+                    date = comment.find("span", "date")["title"]
+                    commentText = comment.find("pre").get_text()
+                    issueComment = IssueComment()
+                    issueComment.date = date
+                    issueComment.author = author
+                    issueComment.summary = commentText
+                    issue.comments.append(issueComment)
+
+                project.issues.append(issue)
+
+        # get wiki pages
+        project.wikis = []
+        wikiSoup = BeautifulSoup(self.curl_get(projectURL + "w/list").getvalue(), "html.parser")
+        if "Your search did not generate any results." not in wikiSoup.get_text():
+            wikiSection = wikiSoup.find("table", "results")
+            for wikiRow in wikiSection.find_all("tr")[1:]:
+                wiki = Wiki()
+                cols = wikiRow.find_all("td")
+                wiki.pageName = cols[1].text.replace("\n", "").strip(" ")
+                wiki.summary = cols[2].text.replace("\n", "").strip(" ")
+                wiki.updated = cols[3].text.replace("\n", "").strip(" ")
+                wikiURL = projectURL + "wiki/" + wiki.pageName
+                wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
+                wikiContent = wikiPageSoup.find(id="wikicontent")
+                wiki.htmlContent = wikiContent.prettify()
+                wiki.textContent = wikiContent.get_text()
+                project.wikis.append(wiki)
+
+        return project
\ No newline at end of file
diff --git a/services/srchub.py b/services/srchub.py
new file mode 100644
index 0000000..0929a3e
--- /dev/null
+++ b/services/srchub.py
@@ -0,0 +1,119 @@
+from Service import Service
+from Project import REPO_TYPES, Project
+from Release import Release
+from Issue import IssueComment, Issue
+from Wiki import Wiki
+from bs4 import BeautifulSoup
+
+class srchub(Service):
+
+    DOMAIN = "https://beta.datanethost.net"
+
+    def getProjects(self):
+        # Perhaps I should provide more API endpoints to make scraping easier...
+        projectlist = self.curl_get(self.DOMAIN + "/projects/").getvalue()
+        soup = BeautifulSoup(projectlist, "html.parser")
+        links = soup.find("ul", "prjlistclass")
+        projects = []
+        for link in links.find_all("a"):
+            project = Project()
+            sourceType = None
+            projectURL = self.DOMAIN + link.get("href")
+            projectName = projectURL.split("/")[-2]
+
+            projectpageHTML = self.curl_get(projectURL).getvalue()
+            projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")
+
+            sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
+            sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + sourceURL).getvalue(), "html.parser")
+            sourceSoupText = sourceSoup.get_text()
+
+            # get source
+            if "git clone" in sourceSoupText:
+                project.repoType = REPO_TYPES.git
+                project.repoURL = "git://" + self.DOMAIN.replace("https://", "") + "/" + projectName + ".git"  # DOMAIN already carries a scheme
+            elif "svn co" in sourceSoupText:
+                project.repoType = REPO_TYPES.SVN
+                project.repoURL = self.DOMAIN + "/svn/" + projectName + "/"
+            else:
+                project.repoType = REPO_TYPES.hg
+                project.repoURL = self.DOMAIN + "/hg/" + projectName + "/"
+
+
+            # get downloads
+            project.releases = []
+            downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/").getvalue(), "html.parser")
+            downloadSection = downloadsSoup.find("table", "uploads")
+            if "No downloads were found." not in downloadsSoup.get_text():
+                downloadRows = downloadSection.find_all("tr")[1:]
+                for downloadRow in downloadRows:
+                    cols = downloadRow.find_all("td")
+                    downloadTD = cols[0]
+                    downloadURL = self.DOMAIN + "/p/" + projectName + "/downloads/get/" + downloadTD.a.text
+                    fileName = downloadTD.a.text
+                    release = Release()
+                    release.fileURL = downloadURL
+                    release.fileName = fileName
+                    project.releases.append(release)
+
+            # get issues
+            project.issues = []
+            issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/").getvalue(), "html.parser")
+            if "No issues were found." not in issuesSoup.get_text():
+                issuesSection = issuesSoup.find("table", "recent-issues")
+                for issueRow in issuesSection.find_all("tr")[1:]:
+                    issue = Issue()
+                    cols = issueRow.find_all("td")
+                    issueId = cols[0].text
+                    issueURL = projectURL + "issues/" + issueId + "/"
+                    issueStatus = cols[2].text
+                    issueSummary = cols[1].text
+                    issueTitle = cols[1].find("a").text
+                    issueAuthor = cols[3].text
+                    issue.author = issueAuthor
+                    issue.comments = []
+                    issue.status = issueStatus
+                    issue.summary = issueSummary
+                    issue.title = issueTitle
+                    issue.id = issueId
+                    # we must go deeper to get comments
+                    issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
+                    for comment in issueComments.find_all("div", "issue-comment"):
+                        author = comment.find("p").get_text().split("by")[1].split(",")[0]
+                        date = comment.find("span").get_text()
+                        commentText = comment.find("pre").get_text()
+                        issueComment = IssueComment()
+                        issueComment.date = date
+                        issueComment.author = author
+                        issueComment.summary = commentText
+                        issue.comments.append(issueComment)
+
+                    project.issues.append(issue)
+
+            # get wiki pages
+            project.wikis = []
+            wikiSoup = BeautifulSoup(self.curl_get(projectURL + "doc/").getvalue(), "html.parser")
+            if "No documentation pages were found." not in wikiSoup.get_text():
+                wikiSection = wikiSoup.find("table", "recent-issues")
+                for wikiRow in wikiSection.find_all("tr")[1:]:
+                    wiki = Wiki()
+                    cols = wikiRow.find_all("td")
+                    wiki.pageName = cols[0].text
+                    wiki.summary = cols[1].text
+                    wiki.updated = cols[2].text
+                    wikiURL = projectURL + "page/" + wiki.pageName + "/"
+                    wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
+                    wikiContent = wikiPageSoup.find(id="wiki-content")
+                    wiki.htmlContent = wikiContent.prettify()
+                    wiki.textContent = wikiContent.get_text()
+                    project.wikis.append(wiki)
+
+
+            projects.append(project)
+
+        return projects
+
+
+
+
+
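
A rough end-to-end sketch of how the scraped objects might be used to archive a project to disk. It is only an illustration on top of the accessors defined above, assuming Python 2 (to match the library's use of urllib and pycurl); the output directory and project name are placeholders.

    # Archive one Google Code project's releases, wiki pages, and issues.
    # "archive" and "android-python27" are illustrative values, not part of the library.
    import os
    import urllib

    from services.googlecode import googlecode

    archive_dir = "archive"
    if not os.path.exists(archive_dir):
        os.makedirs(archive_dir)

    gcode = googlecode()
    project = gcode.getProject("android-python27")

    # Download every release file listed on the downloads page.
    for release in project.getReleases():
        urllib.urlretrieve(release.getFileURL(), os.path.join(archive_dir, release.getFileName()))

    # Save each wiki page as rendered HTML.
    for wiki in project.getWikis():
        with open(os.path.join(archive_dir, wiki.getPageName() + ".html"), "w") as fp:
            fp.write(wiki.getHTMLContent().encode("utf-8"))

    # Dump open issues and their comments as plain text.
    with open(os.path.join(archive_dir, "issues.txt"), "w") as fp:
        for issue in project.getIssues():
            header = u"#%s [%s] %s\n" % (issue.getId(), issue.getStatus(), issue.getTitle())
            fp.write(header.encode("utf-8"))
            for comment in issue.getComments():
                line = u"    %s (%s): %s\n" % (comment.getAuthor(), comment.getDate(), comment.getSummary())
                fp.write(line.encode("utf-8"))

urlretrieve is used for the release downloads simply because it is the shortest standard-library option; Service.curl_get would work just as well, since its StringIO buffer can be written straight to disk.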