Skip to content

Commit 8c13a1d

Browse files
author
Maximilian Karl
authored
Merge pull request #56 from TUBAF-IFI-DiPiT/repository
feat: Add Repository class
2 parents 437fecc + 183b7ea commit 8c13a1d

3 files changed

Lines changed: 251 additions & 0 deletions

File tree

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ The package implements Python functions for
1313
```
1414
|-- My_Github_Repository_0 <- Repository name
1515
| |- Repo.json <- Json file containing user and repo name
16+
| |- Repository
17+
| | |- pdRepository.p
1618
| |- Issues
1719
| | |- pdIssuesComments.p
1820
| | |- pdIssuesEvents.p

github2pandas/repository.py

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
import pandas as pd
2+
from pathlib import Path
3+
from github import GithubException
4+
from .utility import Utility
5+
6+
class Repository(object):
    """
    Class to aggregate general repository data.

    Attributes
    ----------
    REPOSITORY_DIR : str
        Repository dir where all files are saved in.
    REPOSITORY : str
        Pandas table file for basic repository data.

    Methods
    -------
    extract_repository_data(repo, contributor_companies_included = False):
        Extracting general repository data.
    generate_repository_pandas_table(repo, data_root_dir, contributor_companies_included = False):
        Extracting the basic repository data and saving it as pandas table.
    get_repository_keyparameter(data_root_dir, filename=REPOSITORY):
        Get a generated pandas table.

    """

    REPOSITORY_DIR = "Repository"
    REPOSITORY = "pdRepository.p"

    @staticmethod
    def extract_repository_data(repo, contributor_companies_included = False):
        """
        extract_repository_data(repo, contributor_companies_included)

        Extracting general repository data.

        Parameters
        ----------
        repo : Repository
            Repository object from pygithub.
        contributor_companies_included: bool default False
            Starts evaluation of contributor affiliations (huge effort in large projects).

        Returns
        -------
        dict
            Dictionary with the extracted data. Values that could not be
            determined are replaced by placeholders: ``pd.NaT`` for the last
            commit date, ``999999`` for the contributor count and
            ``"not known"`` for organization name, repository type and pull
            request review count.

        Notes
        -----
            PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html

        """
        repo_name = repo.full_name.split('/')[-1]
        user_name = repo.url.split('/')[-2]

        commits = repo.get_commits()
        # Bind a default first so the result can still be built when the
        # repository has no commits.
        last_commit_date = pd.NaT
        try:
            # problem: No commits in repo
            last_commit_date = pd.to_datetime(commits[0].commit.committer.date, format="%Y-%m-%d %H:%M:%S")
        except (GithubException, IndexError):
            print("No commits found!")

        # Materialize the contributors once; the same list is reused for the
        # company evaluation below instead of iterating the API result twice.
        contributor_list = []
        try:
            # problem: history or contributor is too large to list them via the API.
            contributor_list = list(repo.get_contributors('all'))
            contributors_count = len(contributor_list)
        except GithubException:
            print("Too many contributors, not covered by API!")
            contributors_count = 999999

        companies = []
        if contributor_companies_included:
            for contributor in contributor_list:
                try:
                    companies.append(contributor.company)
                except GithubException:
                    print('Contributor does not exist anymore')
        # Only missing (None) entries are dropped; every other value is kept.
        filtered_companies = [company for company in companies if company is not None]

        try:
            # problem: readme.md does not exist
            readme_content = repo.get_readme().content
        except GithubException:
            readme_content = ""
            print("Readme does not exist")
        # problem: sometimes get_readme outputs a NoneType result
        if readme_content is None:
            readme_length = 0
            print("Readme does not exist")
        else:
            # NOTE(review): this is the length of the content string as
            # delivered by PyGithub (presumably base64 encoded) - confirm.
            readme_length = len(readme_content)

        try:
            # problem: empty list of tags
            tag_count = repo.get_tags().totalCount
        except GithubException:
            tag_count = 0
            print("No tags assigned to repository")

        try:
            # problem: organization entry empty (attribute access on None)
            organization_name = repo.organization.name
            repo_type = repo.organization.type
        except (AttributeError, GithubException):
            organization_name = "not known"
            repo_type = "not known"
            print("Organization not valid")

        try:
            # problem: no pull request comments
            pulls_review_comments = repo.get_pulls_review_comments().totalCount
        except GithubException:
            pulls_review_comments = "not known"
            print("No pull request comments")

        repository_data = {
            'repo_name': repo_name,
            'organization_name': organization_name,
            'repo_type': repo_type,
            'user_name': user_name,
            'creation_date': pd.to_datetime(repo.created_at, format="%Y-%m-%d %H:%M:%S"),
            'stars': repo.stargazers_count,
            'size': repo.size,
            'contributor_count': contributors_count,
            'contributor_companies': filtered_companies,
            'contributor_companies_count': len(filtered_companies),
            'repo_url': repo.url,
            'repo_html_url': repo.html_url,
            'branch_count': repo.get_branches().totalCount,
            'commit_count': commits.totalCount,
            'commit_comment_count': repo.get_comments().totalCount,
            'last_commit_date': last_commit_date,
            'labels_count': repo.get_labels().totalCount,
            'tag_count': tag_count,
            'milestone_count': repo.get_milestones(state="all").totalCount,
            'pullrequest_count': repo.get_pulls(state="all").totalCount,
            'pullrequest_review_count': pulls_review_comments,
            'release_count': repo.get_releases().totalCount,
            'workflow_count': repo.get_workflows().totalCount,
            'readme_length': readme_length,
            'issues_count': repo.get_issues(state="all").totalCount,
            'issues_comment_count': repo.get_issues_comments().totalCount,
            'has_wiki': bool(repo.has_wiki),
            'has_pages': bool(repo.has_pages),
            'has_projects': bool(repo.has_projects),
            'has_downloads': bool(repo.has_downloads),
            # A count, not a flag: keep the number instead of casting to bool.
            'watchers_count': repo.watchers_count,
            'is_fork': repo.fork,
        }
        return repository_data

    @staticmethod
    def generate_repository_pandas_table(repo, data_root_dir, contributor_companies_included = False):
        """
        generate_repository_pandas_table(repo, data_root_dir, contributor_companies_included = False)

        Extracting the basic repository data and saving it as pandas table.

        Parameters
        ----------
        repo : Repository
            Repository object from pygithub.
        data_root_dir : str
            Data root directory for the repository.
        contributor_companies_included: bool default False
            Starts evaluation of contributor affiliations (huge effort in large projects).

        Notes
        -----
            PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html

        """
        repository_dir = Path(data_root_dir, Repository.REPOSITORY_DIR)
        repository_dir.mkdir(parents=True, exist_ok=True)

        repository_data = Repository.extract_repository_data(repo, contributor_companies_included)

        # The table consists of a single row describing the repository.
        Utility.save_list_to_pandas_table(repository_dir, Repository.REPOSITORY, [repository_data])

    @staticmethod
    def get_repository_keyparameter(data_root_dir, filename=REPOSITORY):
        """
        get_repository_keyparameter(data_root_dir, filename=REPOSITORY)

        Get a generated pandas table.

        Parameters
        ----------
        data_root_dir : str
            Data root directory for the repository.
        filename : str, default=REPOSITORY
            Pandas table file for basic repository data.

        Returns
        -------
        DataFrame
            Pandas DataFrame read from the file, or an empty DataFrame if the
            file does not exist.

        """
        pd_repository_file = Path(data_root_dir, Repository.REPOSITORY_DIR, filename)
        if pd_repository_file.is_file():
            # NOTE: pickle files must only be read from trusted locations.
            return pd.read_pickle(pd_repository_file)
        return pd.DataFrame()

tests/test_repository.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import unittest
2+
import os
3+
from pathlib import Path
4+
import datetime
5+
import shutil
6+
7+
from github2pandas.repository import Repository
8+
from github2pandas.utility import Utility
9+
10+
class TestRepositories(unittest.TestCase):
    """
    Test case for Repository class.

    The tests talk to the GitHub API and therefore need a valid token in the
    environment variable ``TOKEN``.
    """

    github_token = os.environ['TOKEN']

    git_repo_name = "github2pandas"
    git_repo_owner = "TUBAF-IFI-DiPiT"

    default_data_folder = Path("test_data", git_repo_name)
    repo = Utility.get_repo(git_repo_owner, git_repo_name, github_token, default_data_folder)

    def setUp(self):
        """Make sure the data folder exists before each test."""
        self.default_data_folder.mkdir(parents=True, exist_ok=True)

    def tearDown(self):
        """Remove all generated test data after each test."""
        shutil.rmtree("test_data")

    def test_generate_workflow_pandas_tables(self):
        """Generate the repository table with and without contributor companies."""
        Repository.generate_repository_pandas_table(self.repo, self.default_data_folder, contributor_companies_included = True)
        Repository.generate_repository_pandas_table(self.repo, self.default_data_folder, contributor_companies_included = False)

    def test_get_workflows(self):
        """Read back a freshly generated repository table."""
        # tearDown wipes the data folder after every test, so the table has to
        # be generated here; otherwise only the "file is missing" branch of
        # get_repository_keyparameter would ever be exercised.
        Repository.generate_repository_pandas_table(self.repo, self.default_data_folder)
        pd_repository = Repository.get_repository_keyparameter(self.default_data_folder)
        self.assertFalse(pd_repository.empty)
36+
if __name__ == "__main__":
    # Run the test suite when this file is executed as a script.
    unittest.main()

0 commit comments

Comments
 (0)