codescrape Commit Details


Date: 2015-09-08 21:56:08 (9 years 7 months ago)
Author: Natalie Adams
Branch: master
Commit: 88d65df2bb567eea5fdd6e7b9f9b7375672cfb4d
Message: initial commit

Changes:

File differences

Issue.py
class IssueComment(object):
    author = ""
    title = ""
    summary = ""
    date = ""
    def getAuthor(self):
        return self.author
    def getTitle(self):
        return self.title
    def getSummary(self):
        return self.summary
    def getDate(self):
        return self.date
class Issue(IssueComment):
    status = ""
    comments = []
    id = -1
    def getId(self):
        return self.id
    def getStatus(self):
        return self.status
    def getComments(self):
        return self.comments
Project.py
from enum import enum
REPO_TYPES = enum("SVN", "git", "hg", "NA")
class Project(object):
    repoURL = ""
    releases = []
    issues = []
    wikis = []
    repoType = REPO_TYPES.NA
    def getRepoURL(self):
        return self.repoURL
    def getReleases(self):
        return self.releases
    def getIssues(self):
        return self.issues
    def getRepoType(self):
        return self.repoType
    def getWikis(self):
        return self.wikis
README.txt
# codescrape
Version 1.0
By: Nathan Adams
License: MIT
## Description
This library is used to archive project data. With the announcement that Google Code is going archive-only, I wanted to create a library that lets you grab source data before it is gone forever.
Use cases include archiving projects due to:
- The hosting service shutting down
- Authorities sending a cease-and-desist against the provider/project
- Historical, research, or educational purposes
## Usage
Currently srchub and Google Code are supported. To use:
from services.srchub import srchub
shub = srchub()
projects = shub.getProjects()
or, for Google Code:
from services.googlecode import googlecode
gcode = googlecode()
project = gcode.getProject("android-python27")
The srchub library will pull all public projects, since that list is easily accessible. Google Code does not have a public list per se, and I didn't want to scrape the search results, so the library requires you to pass in the project name. If you get your hands on a list of Google Code projects, you can easily loop through them:
from services.googlecode import googlecode
gcode = googlecode()
for projectName in someProjectList:
    project = gcode.getProject(projectName)
    # do something with project
The project data structure is as follows:
project
- getRepoURL() -> Returns the URL of the repo
- getRepoType() -> Returns the type of repo (git, hg, or SVN)
- getReleases() -> Returns all downloads related to the project
- getIssues() -> Returns open issues
- getWikis() -> Returns wikis
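For example, a minimal archiving loop built on these getters might look like the sketch below (the output layout, file handling, and print formatting are illustrative, not part of the library):
from services.googlecode import googlecode
import os
gcode = googlecode()
project = gcode.getProject("android-python27")
if not os.path.isdir("archive"):
    os.makedirs("archive")
# dump each wiki page to disk and list the downloads and issues
for wiki in project.getWikis():
    with open(os.path.join("archive", wiki.getPageName() + ".html"), "w") as f:
        f.write(wiki.getHTMLContent())
for release in project.getReleases():
    print("%s -> %s" % (release.getFileName(), release.getFileURL()))
for issue in project.getIssues():
    print("#%s [%s] %d comment(s)" % (issue.getId(), issue.getStatus(), len(issue.getComments())))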
Release.py
class Release(object):
    fileName = ""
    summary = ""
    fileURL = ""
    checksum = None
    def getFileName(self):
        return self.fileName
    def getSummary(self):
        return self.summary
    def getFileURL(self):
        return self.fileURL
    def getChecksum(self):
        return self.checksum
Service.py
import pycurl
try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO
try:
    from urllib import urlencode
except ImportError:
    # urlencode moved to urllib.parse in Python 3
    from urllib.parse import urlencode
class Service(object):
    def getProjects(self):
        pass
    def curl_post(self, url, postvals, header = []):
        buffer = StringIO()
        cobj = pycurl.Curl()
        cobj.setopt(pycurl.URL, url)
        cobj.setopt(pycurl.SSL_VERIFYPEER, 0)
        cobj.setopt(pycurl.SSL_VERIFYHOST, 0)
        cobj.setopt(pycurl.POST, 1)
        cobj.setopt(pycurl.WRITEDATA, buffer)
        postdata = urlencode(postvals)
        cobj.setopt(pycurl.POSTFIELDS, postdata)
        cobj.setopt(pycurl.HTTPHEADER, header)
        cobj.perform()
        cobj.close()
        return buffer
    def curl_get(self, url, header = []):
        buffer = StringIO()
        cobj = pycurl.Curl()
        cobj.setopt(pycurl.SSL_VERIFYPEER, 0)
        cobj.setopt(pycurl.SSL_VERIFYHOST, 0)
        cobj.setopt(pycurl.URL, url)
        cobj.setopt(pycurl.WRITEDATA, buffer)
        cobj.setopt(pycurl.HTTPHEADER, header)
        cobj.perform()
        cobj.close()
        return buffer
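The scraper subclasses below call these helpers roughly as in this sketch (the URL, form fields, and header values here are placeholders, not real endpoints):
# Illustrative usage only; the URL, form data, and header values are placeholders.
svc = Service()
page = svc.curl_get("https://example.com/projects/", header=["Accept: text/html"]).getvalue()
result = svc.curl_post("https://example.com/search", {"q": "android"}).getvalue()
Note that under Python 3 pycurl writes bytes to WRITEDATA, so the io.StringIO fallback would likely need to become io.BytesIO for these helpers to work there.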
Wiki.py
class Wiki(object):
    pageName = ""
    htmlContent = ""
    textContent = ""
    summary = ""
    updated = ""
    def getPageName(self):
        return self.pageName
    def getHTMLContent(self):
        return self.htmlContent
    def getTextContent(self):
        return self.textContent
    def getSummary(self):
        return self.summary
    def getUpdated(self):
        return self.updated
enum.py
# Pythonic way to do enums:
# http://stackoverflow.com/a/1695250/195722
def enum(*sequential, **named):
    enums = dict(zip(sequential, range(len(sequential))), **named)
    reverse = dict((value, key) for key, value in enums.items())
    enums['val'] = reverse
    return type('Enum', (), enums)
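As a quick illustration of what this helper produces for Project's REPO_TYPES (a sketch that just restates the zip/range construction above):
# Illustration only: the integer values follow from zip(sequential, range(...)).
REPO_TYPES = enum("SVN", "git", "hg", "NA")
assert REPO_TYPES.SVN == 0
assert REPO_TYPES.git == 1
assert REPO_TYPES.val[2] == "hg"  # reverse lookup through the 'val' mapping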
main.py
from services.srchub import srchub
from services.googlecode import googlecode
#shub = srchub()
#projects = shub.getProjects()
gcode = googlecode()
project = gcode.getProject("android-python27")
services/__init__.py
__author__ = 'nathan'
services/googlecode.py
from Service import Service
from Project import REPO_TYPES, Project
from Release import Release
from Issue import IssueComment, Issue
from Wiki import Wiki
from bs4 import BeautifulSoup
class googlecode(Service):
    DOMAIN = "https://code.google.com"
    # Since I want to stay on Google's good side
    # I'm going to write this method to parse a single project
    # You will need to provide your own project list to roll through
    # Such a list exists (although incomplete)
    # http://flossdata.syr.edu/data/gc/2012/2012-Nov/gcProjectInfo2012-Nov.txt.bz2
    def getProject(self, projectName):
        project = Project()
        sourceType = None
        projectURL = self.DOMAIN + "/p/" + projectName + "/"
        projectpageHTML = self.curl_get(projectURL).getvalue()
        projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")
        sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
        sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + "/p/" + sourceURL).getvalue(), "html.parser")
        sourceSoupText = sourceSoup.get_text()
        # get source
        if "git clone" in sourceSoupText:
            project.repoType = REPO_TYPES.git
            project.repoURL = "https://code.google.com/p/" + projectName + "/"
        elif "svn co" in sourceSoupText:
            project.repoType = REPO_TYPES.SVN
            project.repoURL = "http://" + projectName + ".googlecode.com/svn/"
        else:
            project.repoType = REPO_TYPES.hg
            project.repoURL = "https://code.google.com/p/" + projectName + "/"
        # get downloads
        project.releases = []
        downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/list").getvalue(), "html.parser")
        downloadSection = downloadsSoup.find("table", "results")
        if "Your search did not generate any results." not in downloadsSoup.get_text():
            downloadRows = downloadSection.find_all("tr")[1:]
            for downloadRow in downloadRows:
                cols = downloadRow.find_all("td")
                downloadTD = cols[1]
                downloadURL = "https://" + projectName + ".googlecode.com/files/" + downloadTD.a.text.replace("\n", "").strip(" ")
                fileName = downloadTD.a.text.replace("\n", "").strip(" ")
                release = Release()
                release.fileURL = downloadURL
                release.fileName = fileName
                project.releases.append(release)
        # get issues
        project.issues = []
        issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/list").getvalue(), "html.parser")
        if "Your search did not generate any results." not in issuesSoup.get_text():
            issuesSection = issuesSoup.find("table", "results")
            for issueRow in issuesSection.find_all("tr")[1:]:
                issue = Issue()
                cols = issueRow.find_all("td")
                issueId = cols[1].text.replace("\n", "").strip()
                issueURL = projectURL + "issues/detail?id=" + issueId
                issueStatus = cols[3].text.replace("\n", "").strip(" ")
                issueSummary = cols[8].text.replace("\n", "")
                issueTitle = cols[8].text.replace("\n", "")
                issueAuthor = cols[5].text.replace("\n", "")
                #issue.author = issueAuthor
                issue.comments = []
                issue.status = issueStatus.strip(" ")
                issue.summary = issueSummary.strip(" ")
                issue.title = issueTitle
                issue.id = issueId
                # we must go deeper to get comments
                issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
                for comment in issueComments.find_all("div", "vt"):
                    #author = comment.find(class_="author").find("a").text
                    author = (comment.find(class_="author").find_all("a")[-1]).contents
                    date = comment.find("span", "date")["title"]
                    commentText = comment.find("pre").get_text()
                    issueComment = IssueComment()
                    issueComment.date = date
                    issueComment.author = author
                    issueComment.summary = commentText
                    issue.comments.append(issueComment)
                project.issues.append(issue)
        # get wiki pages
        project.wikis = []
        wikiSoup = BeautifulSoup(self.curl_get(projectURL + "w/list").getvalue(), "html.parser")
        if "Your search did not generate any results." not in wikiSoup.get_text():
            wikiSection = wikiSoup.find("table", "results")
            for wikiRow in wikiSection.find_all("tr")[1:]:
                wiki = Wiki()
                cols = wikiRow.find_all("td")
                wiki.pageName = cols[1].text.replace("\n", "").strip(" ")
                wiki.summary = cols[2].text.replace("\n", "").strip(" ")
                wiki.updated = cols[3].text.replace("\n", "").strip(" ")
                wikiURL = projectURL + "wiki/" + wiki.pageName
                wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
                wikiContent = wikiPageSoup.find(id="wikicontent")
                wiki.htmlContent = wikiContent.prettify()
                wiki.textContent = wikiContent.get_text()
                project.wikis.append(wiki)
        return project
services/srchub.py
from Service import Service
from Project import REPO_TYPES, Project
from Release import Release
from Issue import IssueComment, Issue
from Wiki import Wiki
from bs4 import BeautifulSoup
class srchub(Service):
    DOMAIN = "https://beta.datanethost.net"
    def getProjects(self):
        # Perhaps I should provide more API endpoints to make scraping easier...
        projectlist = self.curl_get(self.DOMAIN + "/projects/").getvalue()
        soup = BeautifulSoup(projectlist, "html.parser")
        links = soup.find("ul", "prjlistclass")
        projects = []
        for link in links.find_all("a"):
            project = Project()
            sourceType = None
            projectURL = self.DOMAIN + link.get("href")
            projectName = projectURL.split("/")[-2]
            projectpageHTML = self.curl_get(projectURL).getvalue()
            projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")
            sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
            sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + sourceURL).getvalue(), "html.parser")
            sourceSoupText = sourceSoup.get_text()
            # get source
            if "git clone" in sourceSoupText:
                project.repoType = REPO_TYPES.git
                project.repoURL = "git://" + self.DOMAIN + "/" + projectName + ".git"
            elif "svn co" in sourceSoupText:
                project.repoType = REPO_TYPES.SVN
                project.repoURL = "https://" + self.DOMAIN + "/svn/" + projectName + "/"
            else:
                project.repoType = REPO_TYPES.hg
                project.repoURL = "https://" + self.DOMAIN + "/hg/" + projectName + "/"
            # get downloads
            project.releases = []
            downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/").getvalue(), "html.parser")
            downloadSection = downloadsSoup.find("table", "uploads")
            if "No downloads were found." not in downloadsSoup.get_text():
                downloadRows = downloadSection.find_all("tr")[1:]
                for downloadRow in downloadRows:
                    cols = downloadRow.find_all("td")
                    downloadTD = cols[0]
                    downloadURL = self.DOMAIN + "/p/" + projectName + "/downloads/get/" + downloadTD.a.text
                    fileName = downloadTD.a.text
                    release = Release()
                    release.fileURL = downloadURL
                    release.fileName = fileName
                    project.releases.append(release)
            # get issues
            project.issues = []
            issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/").getvalue(), "html.parser")
            if "No issues were found." not in issuesSoup.get_text():
                issuesSection = issuesSoup.find("table", "recent-issues")
                for issueRow in issuesSection.find_all("tr")[1:]:
                    issue = Issue()
                    cols = issueRow.find_all("td")
                    issueId = cols[0].text
                    issueURL = projectURL + "issues/" + issueId + "/"
                    issueStatus = cols[2].text
                    issueSummary = cols[1].text
                    issueTitle = cols[1].find("a").text
                    issueAuthor = cols[3].text
                    issue.author = issueAuthor
                    issue.comments = []
                    issue.status = issueStatus
                    issue.summary = issueSummary
                    issue.title = issueTitle
                    issue.id = issueId
                    # we must go deeper to get comments
                    issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
                    for comment in issueComments.find_all("div", "issue-comment"):
                        author = comment.find("p").get_text().split("by")[1].split(",")[0]
                        date = comment.find("span").get_text()
                        commentText = comment.find("pre").get_text()
                        issueComment = IssueComment()
                        issueComment.date = date
                        issueComment.author = author
                        issueComment.summary = commentText
                        issue.comments.append(issueComment)
                    project.issues.append(issue)
            # get wiki pages
            project.wikis = []
            wikiSoup = BeautifulSoup(self.curl_get(projectURL + "doc/").getvalue(), "html.parser")
            if "No documentation pages were found." not in wikiSoup.get_text():
                wikiSection = wikiSoup.find("table", "recent-issues")
                for wikiRow in wikiSection.find_all("tr")[1:]:
                    wiki = Wiki()
                    cols = wikiRow.find_all("td")
                    wiki.pageName = cols[0].text
                    wiki.summary = cols[1].text
                    wiki.updated = cols[2].text
                    wikiURL = projectURL + "page/" + wiki.pageName + "/"
                    wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
                    wikiContent = wikiPageSoup.find(id="wiki-content")
                    wiki.htmlContent = wikiContent.prettify()
                    wiki.textContent = wikiContent.get_text()
                    project.wikis.append(wiki)
            projects.append(project)
        return projects
