|
| 1 | +import pandas as pd |
| 2 | +from pathlib import Path |
| 3 | +from github import GithubException |
| 4 | +from .utility import Utility |
| 5 | + |
class Repository(object):
    """
    Class to aggregate Workflows.

    Attributes
    ----------
    REPOSITORY_DIR : str
        repository dir where all files are saved in.
    REPOSITORY : str
        Pandas table file for basic repository data.

    Methods
    -------
    extract_repository_data(repo, contributor_companies_included=False):
        Extracting general repository data.
    generate_repository_pandas_table(repo, data_root_dir, contributor_companies_included=False):
        Extracting the basic repository data and saving it as a pandas table.
    get_repository_keyparameter(data_root_dir, filename=REPOSITORY):
        Get a generated pandas table.

    """

    REPOSITORY_DIR = "Repository"
    REPOSITORY = "pdRepository.p"

    @staticmethod
    def extract_repository_data(repo, contributor_companies_included=False):
        """
        extract_repository_data(repo, contributor_companies_included)

        Extracting general repository data.

        Parameters
        ----------
        repo : Repository
            Repository object from pygithub.
        contributor_companies_included : bool, default False
            Starts evaluation of contributor affiliations (huge effort in large projects).

        Returns
        -------
        dict
            Dictionary with the extracted data.

        Notes
        -----
        PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html

        """
        repo_name = repo.full_name.split('/')[-1]
        user_name = repo.url.split('/')[-2]

        commits = repo.get_commits()
        # FIX: initialize last_commit_date so an empty repo no longer raises
        # NameError when the result dict is built below.
        last_commit_date = None
        try:
            # problem: No commits in repo
            # FIX: format was "%Y-%m-%d M:%S", which is not a valid strftime
            # pattern for the commit timestamps; use "%Y-%m-%d %H:%M:%S".
            last_commit_date = pd.to_datetime(commits[0].commit.committer.date, format="%Y-%m-%d %H:%M:%S")
        except GithubException:
            print("No commits found!")

        contributors = repo.get_contributors('all')
        try:
            # problem: history or contributor list is too large to list via the API.
            contributors_count = len(list(contributors))
        except GithubException:
            print("Too many contributors, not covered by API!")
            # sentinel value kept for backward compatibility with existing tables
            contributors_count = 999999

        companies = []
        if contributor_companies_included:
            # FIX: loop variable used to shadow the paginated list it iterates
            # ("for contributor in contributor"); use a distinct name.
            for person in contributors:
                try:
                    companies.append(person.company)
                except GithubException:
                    print('Contributor does not exist anymore')
                    continue
        # drop contributors without a company affiliation
        filtered_companies = list(filter(None.__ne__, companies))

        try:
            # problem: readme.md does not exist
            readme_content = repo.get_readme().content
        except GithubException:
            readme_content = ""
            print("Readme does not exist")
        # problem: sometimes get_readme outputs a NoneType result
        if readme_content is None:
            readme_length = 0
            print("Readme does not exist")
        else:
            readme_length = len(readme_content)

        try:
            # problem: empty list of tags
            tag_count = repo.get_tags().totalCount
        except GithubException:
            tag_count = 0
            print("No tags assigned to repository")

        try:
            # problem: organization entry empty (repo.organization is None)
            # FIX: was a bare "except:", which also swallowed KeyboardInterrupt
            # and SystemExit; catch only the failures that actually occur here.
            organization_name = repo.organization.name
            repo_type = repo.organization.type
        except (GithubException, AttributeError):
            organization_name = "not known"
            repo_type = "not known"
            print("Organization not valid")

        try:
            # problem: no pull request comments
            pulls_review_comments = repo.get_pulls_review_comments().totalCount
        except GithubException:
            pulls_review_comments = "not known"
            print("No pull request comments")

        repository_data = {
            'repo_name': repo_name,
            'organization_name': organization_name,
            'repo_type': repo_type,
            'user_name': user_name,
            'creation_date': pd.to_datetime(repo.created_at, format="%Y-%m-%d %H:%M:%S"),
            'stars': repo.stargazers_count,
            'size': repo.size,
            'contributor_count': contributors_count,
            'contributor_companies': filtered_companies,
            'contributor_companies_count': len(filtered_companies),
            'repo_url': repo.url,
            'repo_html_url': repo.html_url,
            'branch_count': repo.get_branches().totalCount,
            'commit_count': commits.totalCount,
            'commit_comment_count': repo.get_comments().totalCount,
            'last_commit_date': last_commit_date,
            'labels_count': repo.get_labels().totalCount,
            'tag_count': tag_count,
            'milestone_count': repo.get_milestones(state="all").totalCount,
            'pullrequest_count': repo.get_pulls(state="all").totalCount,
            'pullrequest_review_count': pulls_review_comments,
            'release_count': repo.get_releases().totalCount,
            'workflow_count': repo.get_workflows().totalCount,
            'readme_length': readme_length,
            'issues_count': repo.get_issues(state="all").totalCount,
            'issues_comment_count': repo.get_issues_comments().totalCount,
            'has_wiki': bool(repo.has_wiki),
            'has_pages': bool(repo.has_pages),
            'has_projects': bool(repo.has_projects),
            'has_downloads': bool(repo.has_downloads),
            # FIX: was bool(repo.watchers_count) — a copy-paste from the has_*
            # flags above that destroyed the actual count; store the integer.
            'watchers_count': repo.watchers_count,
            'is_fork': repo.fork,
        }
        return repository_data

    @staticmethod
    def generate_repository_pandas_table(repo, data_root_dir, contributor_companies_included=False):
        """
        generate_repository_pandas_table(repo, data_root_dir, contributor_companies_included=False)

        Extracting the basic repository data and saving it as a pandas table.

        Parameters
        ----------
        repo : Repository
            Repository object from pygithub.
        data_root_dir : str
            Data root directory for the repository.
        contributor_companies_included : bool, default False
            Starts evaluation of contributor affiliations (huge effort in large projects).

        Notes
        -----
        PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html

        """
        repository_dir = Path(data_root_dir, Repository.REPOSITORY_DIR)
        repository_dir.mkdir(parents=True, exist_ok=True)

        repository_data = Repository.extract_repository_data(repo, contributor_companies_included)

        # the table writer expects a list of row dicts, even for a single row
        repository_data_list = [repository_data]
        Utility.save_list_to_pandas_table(repository_dir, Repository.REPOSITORY, repository_data_list)

    @staticmethod
    def get_repository_keyparameter(data_root_dir, filename=REPOSITORY):
        """
        get_repository_keyparameter(data_root_dir, filename=REPOSITORY)

        Get a generated pandas table.

        Parameters
        ----------
        data_root_dir : str
            Data root directory for the repository.
        filename : str, default REPOSITORY
            Pandas table file for repository data.

        Returns
        -------
        DataFrame
            Pandas DataFrame which can include the desired data; empty
            DataFrame when the pickle file does not exist.

        """
        repository_dir = Path(data_root_dir, Repository.REPOSITORY_DIR)
        pd_repository_file = Path(repository_dir, filename)
        if pd_repository_file.is_file():
            return pd.read_pickle(pd_repository_file)
        else:
            return pd.DataFrame()
0 commit comments