From 54ed5846fea9caa899387affde17183190fa69f7 Mon Sep 17 00:00:00 2001 From: harupy <17039389+harupy@users.noreply.github.com> Date: Fri, 2 May 2025 13:41:58 +0900 Subject: [PATCH 01/17] fix Signed-off-by: harupy <17039389+harupy@users.noreply.github.com> --- src/client.py | 185 +++++++++++++++++++++++++++++++++++++++++++++++++- src/dump.py | 22 +++--- src/models.py | 8 +-- 3 files changed, 199 insertions(+), 16 deletions(-) diff --git a/src/client.py b/src/client.py index 07591d6d..7495e23d 100644 --- a/src/client.py +++ b/src/client.py @@ -37,7 +37,8 @@ def get_paginate(self, end_point, params=None): while True: logger.info(f"{end_point} {page}") res = self.get( - end_point, params={**(params or {}), "page": page, "per_page": self.per_page} + end_point, + params={**(params or {}), "page": page, "per_page": self.per_page}, ) yield from res if len(res) < self.per_page: @@ -135,3 +136,185 @@ def get_discussions(self, owner, repo): after = page_info["endCursor"] if not page_info["hasNextPage"]: break + + def get_issues_graphql(self, owner, repo): + query = """ +query { + repository(owner: "%s", name: "%s") { + issues(first: %d, states: [OPEN, CLOSED], orderBy: {field: CREATED_AT, direction: ASC}) { + totalCount + pageInfo { + endCursor + hasNextPage + } + nodes { + id + number + title + body + state + closedAt + createdAt + updatedAt + url + author { + login + ... on User { id } + } + } + } + } +} +""" % ( + owner, + repo, + self.per_page, + # state, + ) + + query_with_cursor = """ +query { + repository(owner: "%s", name: "%s") { + issues(first: %d, states: [OPEN, CLOSED], after: "AFTER", orderBy: {field: CREATED_AT, direction: ASC}) { + totalCount + pageInfo { + endCursor + hasNextPage + } + nodes { + id + number + title + body + state + closedAt + createdAt + updatedAt + url + author { + login + ... on User { id } + } + } + } + } +} +""" % ( + owner, + repo, + self.per_page, + ) + after = None + while True: + q = query if after is None else query_with_cursor.replace("AFTER", after) + data = self.run_graphql_query(q) + issues = data["data"]["repository"]["issues"] + for node in issues["nodes"]: + # Normalize author and pullRequest for compatibility with models.py + if node["author"] and "id" in node["author"]: + node["user"] = { + "id": node["author"]["id"], + "login": node["author"]["login"], + } + else: + node["user"] = { + "id": 0, + "login": node["author"]["login"] if node["author"] else None, + } + node["pullRequest"] = False + yield node + page_info = issues["pageInfo"] + after = page_info["endCursor"] + if not page_info["hasNextPage"]: + break + + def get_pulls_graphql(self, owner, repo): + query = """ +query { + repository(owner: "%s", name: "%s") { + pullRequests(first: %d, states: [OPEN, CLOSED], orderBy: {field: CREATED_AT, direction: ASC}) { + totalCount + pageInfo { + endCursor + hasNextPage + } + nodes { + id + number + title + body + state + closedAt + createdAt + updatedAt + url + author { + login + ... on User { id } + } + } + } + } +} +""" % ( + owner, + repo, + self.per_page, + # state, + ) + + query_with_cursor = """ +query { + repository(owner: "%s", name: "%s") { + pullRequests(first: %d, states: [OPEN, CLOSED], after: "AFTER", orderBy: {field: CREATED_AT, direction: ASC}) { + totalCount + pageInfo { + endCursor + hasNextPage + } + nodes { + id + number + title + body + state + closedAt + createdAt + updatedAt + url + author { + login + ... on User { id } + } + } + } + } +} +""" % ( + owner, + repo, + self.per_page, + ) + after = None + while True: + q = query if after is None else query_with_cursor.replace("AFTER", after) + data = self.run_graphql_query(q) + pulls = data["data"]["repository"]["pullRequests"] + for node in pulls["nodes"]: + # Normalize author and pullRequest for compatibility with models.py + if node["author"] and "id" in node["author"]: + node["user"] = { + "id": node["author"]["id"], + "login": node["author"]["login"], + } + else: + node["user"] = { + "id": 0, + "login": node["author"]["login"] if node["author"] else None, + } + node["pullRequest"] = True + yield node + page_info = pulls["pageInfo"] + after = page_info["endCursor"] + if not page_info["hasNextPage"]: + break diff --git a/src/dump.py b/src/dump.py index 745121e7..abed4d56 100644 --- a/src/dump.py +++ b/src/dump.py @@ -44,7 +44,7 @@ def main(): g = GitHubApiClient(per_page=100) pprint(g.get_rate_limit()) since = datetime(1970, 1, 1) - # since = datetime(2022, 7, 1) + # since = datetime(2025, 4, 1) logger.info("Collecting commits") commits = g.get_commits( @@ -61,22 +61,22 @@ def main(): logger.info("Collecting mlflow org members") mlflow_org_members = set( - HashableDict(id=m["id"], login=m["login"]) for m in g.get_organization_members("mlflow") + HashableDict(id=m["id"], login=m["login"]) + for m in g.get_organization_members("mlflow") ) collaborators = set( - HashableDict(id=c["id"], login=c["login"]) for c in g.get_collaborators(*repo) + HashableDict(id=c["id"], login=c["login"]) + for c in g.get_collaborators(*repo) + ) + session.add_all( + M.MlflowOrgMember.from_gh_objects(mlflow_org_members.union(collaborators)) ) - session.add_all(M.MlflowOrgMember.from_gh_objects(mlflow_org_members.union(collaborators))) logger.info("Collecting issues") - issues = g.get_issues( - *repo, - params={ - "state": "all", - "since": since, - }, - ) + issues = g.get_issues_graphql(*repo) + pulls = g.get_pulls_graphql(*repo) session.add_all(M.Issue.from_gh_objects(issues)) + session.add_all(M.Issue.from_gh_objects(pulls)) logger.info("Collecting discussions") discussions = g.get_discussions(*repo) diff --git a/src/models.py b/src/models.py index 6a0d3f78..7eb3c716 100644 --- a/src/models.py +++ b/src/models.py @@ -143,10 +143,10 @@ def from_gh_object(cls, issue): body=issue["body"], state=issue["state"], closed_at=closed_at and parse_datetime(closed_at), - created_at=parse_datetime(issue["created_at"]), - updated_at=parse_datetime(issue["updated_at"]), - html_url=issue["html_url"], - is_pr="pull_request" in issue, + created_at=parse_datetime(issue["createdAt"]), + updated_at=parse_datetime(issue["updatedAt"]), + html_url=issue["url"], + is_pr=issue.get("pullRequest", False), ) From 940cc1b7e3eba35ce333b68fafc39b5ead63a331 Mon Sep 17 00:00:00 2001 From: harupy <17039389+harupy@users.noreply.github.com> Date: Fri, 2 May 2025 13:44:14 +0900 Subject: [PATCH 02/17] test Signed-off-by: harupy <17039389+harupy@users.noreply.github.com> --- .github/workflows/build.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0406a743..867b2a53 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,6 +6,7 @@ on: push: branches: - master + pull_request: workflow_dispatch: permissions: @@ -26,6 +27,7 @@ jobs: - name: Install dependencies run: pip install -r requirements.txt - name: Run builder + if: github.event_name != 'pull_request' run: | python src/dump.py python src/build.py @@ -33,6 +35,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.HARUPY_GITHUB_TOKEN }} - name: Deploy 🚀 uses: JamesIves/github-pages-deploy-action@v4 + if: github.event_name != 'pull_request' with: branch: gh-pages folder: dist From fa186c5483ab95ed807b97a04c651ce217ecd706 Mon Sep 17 00:00:00 2001 From: harupy <17039389+harupy@users.noreply.github.com> Date: Fri, 2 May 2025 13:45:49 +0900 Subject: [PATCH 03/17] fix Signed-off-by: harupy <17039389+harupy@users.noreply.github.com> --- .github/workflows/build.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 867b2a53..16ffc59d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -27,7 +27,6 @@ jobs: - name: Install dependencies run: pip install -r requirements.txt - name: Run builder - if: github.event_name != 'pull_request' run: | python src/dump.py python src/build.py From c8febeb4dd49f0cec46c668d82f3bb0ddd1037b0 Mon Sep 17 00:00:00 2001 From: harupy <17039389+harupy@users.noreply.github.com> Date: Fri, 2 May 2025 13:48:55 +0900 Subject: [PATCH 04/17] a Signed-off-by: harupy <17039389+harupy@users.noreply.github.com> --- src/client.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/client.py b/src/client.py index 7495e23d..4d4d68fe 100644 --- a/src/client.py +++ b/src/client.py @@ -205,7 +205,10 @@ def get_issues_graphql(self, owner, repo): self.per_page, ) after = None + page = 0 while True: + page += 1 + logger.info(f"Issues page {page}") q = query if after is None else query_with_cursor.replace("AFTER", after) data = self.run_graphql_query(q) issues = data["data"]["repository"]["issues"] @@ -296,7 +299,10 @@ def get_pulls_graphql(self, owner, repo): self.per_page, ) after = None + page = 0 while True: + page += 1 + logger.info(f"Pulls page {page}") q = query if after is None else query_with_cursor.replace("AFTER", after) data = self.run_graphql_query(q) pulls = data["data"]["repository"]["pullRequests"] From cda5648460bf03aa06ce329485a13c08deeaafa7 Mon Sep 17 00:00:00 2001 From: harupy <17039389+harupy@users.noreply.github.com> Date: Fri, 2 May 2025 14:01:15 +0900 Subject: [PATCH 05/17] fix Signed-off-by: harupy <17039389+harupy@users.noreply.github.com> --- src/build.py | 2 +- src/client.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/build.py b/src/build.py index 6024e428..4b675bc4 100644 --- a/src/build.py +++ b/src/build.py @@ -357,7 +357,7 @@ def main(): ) opened_pulls = opened_pulls[(opened_pulls._merge == "left_only")].drop("_merge", axis=1) opened_pulls_by_month = count_by_month(opened_pulls, "created_at") - closed_pulls = opened_pulls[opened_pulls["state"] == "closed"] + closed_pulls = opened_pulls[opened_pulls["state"] == "closed" | opened_pulls["state"] == "merged"] closed_pulls_by_month = count_by_month(closed_pulls, "closed_at") pulls_non_maintainers_plot_path = plots_dir.joinpath("pulls_non_maintainers.html") make_plot( diff --git a/src/client.py b/src/client.py index 4d4d68fe..0edee415 100644 --- a/src/client.py +++ b/src/client.py @@ -225,6 +225,7 @@ def get_issues_graphql(self, owner, repo): "login": node["author"]["login"] if node["author"] else None, } node["pullRequest"] = False + node["state"] = node["state"].lower() yield node page_info = issues["pageInfo"] after = page_info["endCursor"] @@ -235,7 +236,7 @@ def get_pulls_graphql(self, owner, repo): query = """ query { repository(owner: "%s", name: "%s") { - pullRequests(first: %d, states: [OPEN, CLOSED], orderBy: {field: CREATED_AT, direction: ASC}) { + pullRequests(first: %d, states: [OPEN, CLOSED, MERGED], orderBy: {field: CREATED_AT, direction: ASC}) { totalCount pageInfo { endCursor @@ -269,7 +270,7 @@ def get_pulls_graphql(self, owner, repo): query_with_cursor = """ query { repository(owner: "%s", name: "%s") { - pullRequests(first: %d, states: [OPEN, CLOSED], after: "AFTER", orderBy: {field: CREATED_AT, direction: ASC}) { + pullRequests(first: %d, states: [OPEN, CLOSED, MERGED], after: "AFTER", orderBy: {field: CREATED_AT, direction: ASC}) { totalCount pageInfo { endCursor @@ -319,6 +320,7 @@ def get_pulls_graphql(self, owner, repo): "login": node["author"]["login"] if node["author"] else None, } node["pullRequest"] = True + node["state"] = node["state"].lower() yield node page_info = pulls["pageInfo"] after = page_info["endCursor"] From 9141d4687183256c433acb6890fda170491b09e8 Mon Sep 17 00:00:00 2001 From: harupy <17039389+harupy@users.noreply.github.com> Date: Fri, 2 May 2025 14:09:26 +0900 Subject: [PATCH 06/17] fix --- src/build.py | 79 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/src/build.py b/src/build.py index 4b675bc4..fa7374e2 100644 --- a/src/build.py +++ b/src/build.py @@ -24,7 +24,9 @@ def count_by_month(df, datetime_col): .pipe( lambda df_: ( df_.set_index( - df_.index.map(lambda year_month: datetime(year_month[0], year_month[1], 1)) + df_.index.map( + lambda year_month: datetime(year_month[0], year_month[1], 1) + ) ) ) ) @@ -136,13 +138,13 @@ def main(): x_tick_vals=x_tick_vals, x_axis_range=x_axis_range, y_axis_range=get_y_axis_range( - contributors_by_month[contributors_by_month["date"] >= year_ago]["count"] + contributors_by_month[contributors_by_month["date"] >= year_ago][ + "count" + ] ), ).write_html(contributors_plot_path, include_plotlyjs="cdn") - commits_url_template = ( - "https://github.com/mlflow/mlflow/commits?author={author}&since={since}&until={until}" - ) + commits_url_template = "https://github.com/mlflow/mlflow/commits?author={author}&since={since}&until={until}" anchor_template = '{text}' six_month_ago = now - relativedelta(months=6) active_contributors = ( @@ -167,11 +169,15 @@ def main(): ) .assign( user=lambda df: df.apply( - lambda row: anchor_template.format(url=row["user_url"], text=row["user_login"]), + lambda row: anchor_template.format( + url=row["user_url"], text=row["user_login"] + ), axis=1, ), PRs=lambda df: df.apply( - lambda row: anchor_template.format(url=row["commits"], text=row["PRs"]), + lambda row: anchor_template.format( + url=row["commits"], text=row["PRs"] + ), axis=1, ), ) @@ -197,7 +203,9 @@ def main(): first_commits = raw_commits.sort_values("date").groupby("user_name").head(1) total_contributors_by_month = count_by_month(first_commits, "date") - total_contributors_by_month["count"] = total_contributors_by_month["count"].cumsum() + total_contributors_by_month["count"] = total_contributors_by_month[ + "count" + ].cumsum() total_contributors_path = plots_dir.joinpath("total_contributors.html") make_plot( go.Scatter( @@ -209,9 +217,9 @@ def main(): x_tick_vals=x_tick_vals, x_axis_range=x_axis_range, y_axis_range=get_y_axis_range( - total_contributors_by_month[total_contributors_by_month["date"] >= year_ago][ - "count" - ] + total_contributors_by_month[ + total_contributors_by_month["date"] >= year_ago + ]["count"] ), ).write_html(total_contributors_path, include_plotlyjs="cdn") @@ -306,8 +314,12 @@ def main(): x_tick_vals=x_tick_vals, x_axis_range=x_axis_range, y_axis_range=get_y_axis_range( - opened_issues_by_month[opened_issues_by_month["date"] >= year_ago]["count"], - closed_issues_by_month[closed_issues_by_month["date"] >= year_ago]["count"], + opened_issues_by_month[opened_issues_by_month["date"] >= year_ago][ + "count" + ], + closed_issues_by_month[closed_issues_by_month["date"] >= year_ago][ + "count" + ], ), ).write_html(issues_plot_path, include_plotlyjs="cdn") @@ -319,9 +331,14 @@ def main(): how="outer", indicator=True, ) - opened_pulls = opened_pulls[(opened_pulls._merge == "both")].drop("_merge", axis=1) + opened_pulls = opened_pulls[(opened_pulls._merge == "both")].drop( + "_merge", axis=1 + ) opened_pulls_by_month = count_by_month(opened_pulls, "created_at") - closed_pulls = opened_pulls[opened_pulls["state"] == "closed"] + closed_pulls = opened_pulls[ + opened_pulls["state"] == "closed" | opened_pulls["state"] == "merged" + ] + print(closed_pulls) closed_pulls_by_month = count_by_month(closed_pulls, "closed_at") pulls_maintainers_plot_path = plots_dir.joinpath("pulls_all.html") make_plot( @@ -341,8 +358,12 @@ def main(): x_tick_vals=x_tick_vals, x_axis_range=x_axis_range, y_axis_range=get_y_axis_range( - opened_pulls_by_month[opened_pulls_by_month["date"] >= year_ago]["count"], - closed_pulls_by_month[closed_pulls_by_month["date"] >= year_ago]["count"], + opened_pulls_by_month[opened_pulls_by_month["date"] >= year_ago][ + "count" + ], + closed_pulls_by_month[closed_pulls_by_month["date"] >= year_ago][ + "count" + ], ), ).write_html(pulls_maintainers_plot_path, include_plotlyjs="cdn") @@ -355,11 +376,17 @@ def main(): how="outer", indicator=True, ) - opened_pulls = opened_pulls[(opened_pulls._merge == "left_only")].drop("_merge", axis=1) + opened_pulls = opened_pulls[(opened_pulls._merge == "left_only")].drop( + "_merge", axis=1 + ) opened_pulls_by_month = count_by_month(opened_pulls, "created_at") - closed_pulls = opened_pulls[opened_pulls["state"] == "closed" | opened_pulls["state"] == "merged"] + closed_pulls = opened_pulls[ + opened_pulls["state"] == "closed" | opened_pulls["state"] == "merged" + ] closed_pulls_by_month = count_by_month(closed_pulls, "closed_at") - pulls_non_maintainers_plot_path = plots_dir.joinpath("pulls_non_maintainers.html") + pulls_non_maintainers_plot_path = plots_dir.joinpath( + "pulls_non_maintainers.html" + ) make_plot( go.Scatter( x=opened_pulls_by_month["date"], @@ -377,8 +404,12 @@ def main(): x_tick_vals=x_tick_vals, x_axis_range=x_axis_range, y_axis_range=get_y_axis_range( - opened_pulls_by_month[opened_pulls_by_month["date"] >= year_ago]["count"], - closed_pulls_by_month[closed_pulls_by_month["date"] >= year_ago]["count"], + opened_pulls_by_month[opened_pulls_by_month["date"] >= year_ago][ + "count" + ], + closed_pulls_by_month[closed_pulls_by_month["date"] >= year_ago][ + "count" + ], ), ).write_html(pulls_non_maintainers_plot_path, include_plotlyjs="cdn") @@ -441,7 +472,9 @@ def main(): iframes = [] for plot in plots: iframes.append(iframe_html_template.format(src=plot.relative_to(dist_dir))) - plots_html += '