Skip to content
Merged

Ra #77

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/report_test_coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ on:
jobs:
report-test-coverage:
runs-on: ubuntu-latest
if: ${{ github.event.workflow_run.conclusion == 'success' }}
steps:
- name: "Download artifact"
uses: actions/github-script@v6
Expand Down
212 changes: 149 additions & 63 deletions .github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Any, Dict, List

Expand All @@ -10,17 +9,48 @@
from requests_toolbelt import MultipartEncoder


@dataclass
class Contributor:
class Counter(dict):
    """
    Dictionary-based tally that maps an item to the number of times it was recorded.

    Keys are the recorded items (e.g. contributor logins) and values are their counts.
    """

    def record(self, item: str):
        """
        Increase the count of ``item`` by one, starting from zero if it is unseen.

        Args:
            item (str): the key to count
        """
        self[item] = self.get(item, 0) + 1

    def to_sorted_list(self):
        """
        Return the tally as a list of ``(item, count)`` tuples ordered from the
        highest count to the lowest.

        The sort is stable, so items with equal counts keep their insertion order.
        """
        return sorted(self.items(), key=lambda pair: pair[1], reverse=True)


def get_utc_time_one_week_ago():
    """
    Get the UTC time one week ago.

    Returns:
        datetime: a naive datetime (tzinfo is None) holding the UTC wall-clock
        time of exactly 7 days before the call.
    """
    # local import keeps this block self-contained; the file-level import
    # only brings in datetime and timedelta
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC "now"
    # and strip tzinfo so the value stays comparable with the naive datetimes
    # produced by datetime.strptime elsewhere in this file
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    return now - timedelta(days=7)


def datetime2str(dt):
    """
    Render a datetime as text of the form YYYY-MM-DDTHH:MM:SSZ.

    The trailing "Z" is a literal character of the format; no timezone
    conversion is applied to ``dt`` before formatting.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    rendered = dt.strftime(timestamp_format)
    return rendered


def str2datetime(string):
    """
    Parse text of the form YYYY-MM-DDTHH:MM:SSZ into a datetime.

    The trailing "Z" is matched as a literal character, so the returned
    datetime carries no tzinfo.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    parsed = datetime.strptime(string, timestamp_format)
    return parsed


def plot_bar_chart(x: List[Any], y: List[Any], xlabel: str, ylabel: str, title: str, output_path: str) -> None:
Expand All @@ -36,7 +66,28 @@ def plot_bar_chart(x: List[Any], y: List[Any], xlabel: str, ylabel: str, title:
plt.savefig(output_path, dpi=1200)


def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str, int]:
def get_organization_repositories(github_token, organization_name) -> List[str]:
    """
    Retrieve the names of all public repositories under the organization.

    Args:
        github_token (str): token sent as a Bearer credential to the GitHub REST API
        organization_name (str): login name of the GitHub organization

    Returns:
        List[str]: the ``name`` field of every public repository of the organization

    Raises:
        requests.HTTPError: if GitHub answers with an error status
            (e.g. invalid token or rate limit exceeded)
    """
    url = f"https://api.github.com/orgs/{organization_name}/repos"

    # prepare header
    headers = {
        'Authorization': f'Bearer {github_token}',
        'Accept': 'application/vnd.github+json',
        'X-GitHub-Api-Version': '2022-11-28'
    }

    per_page = 100
    page = 1
    repo_list = []

    # the endpoint is paginated, so a single request would silently drop
    # repositories once the organization has more than one page of them
    while True:
        params = {'type': 'public', 'per_page': per_page, 'page': page}
        response = requests.get(url, headers=headers, params=params, timeout=30)

        # fail loudly instead of iterating over an error payload
        response.raise_for_status()

        items = response.json()
        repo_list.extend(item['name'] for item in items)

        # a short (or empty) page means there is nothing left to fetch
        if len(items) < per_page:
            break
        page += 1

    return repo_list


def get_issue_pull_request_comments(github_token: str, org_name: str, repo_name: str, since: str) -> Dict[str, int]:
"""
Retrieve the issue/PR comments made by our members in the last 7 days.

Expand All @@ -56,7 +107,7 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
# do pagination to the API
page = 1
while True:
comment_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/comments?since={since}&page={page}'
comment_api = f'https://api.github.com/repos/{org_name}/{repo_name}/issues/comments?since={since}&page={page}'
comment_response = requests.get(comment_api, headers=headers).json()

if len(comment_response) == 0:
Expand All @@ -70,7 +121,7 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
continue

issue_id = item['issue_url'].split('/')[-1]
issue_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/{issue_id}'
issue_api = f'https://api.github.com/repos/{org_name}/{repo_name}/issues/{issue_id}'
issue_response = requests.get(issue_api, headers=headers).json()
issue_author_relationship = issue_response['author_association']

Expand All @@ -87,7 +138,7 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
return user_engagement_count


def get_discussion_comments(github_token, since) -> Dict[str, int]:
def get_discussion_comments(github_token: str, org_name: str, repo_name: str, since: str) -> Dict[str, int]:
"""
Retrieve the discussion comments made by our members in the last 7 days.
This is only available via the GitHub GraphQL API.
Expand All @@ -105,7 +156,7 @@ def _generate_discussion_query(num, cursor: str = None):
offset_str = f", after: \"{cursor}\""
query = f"""
{{
repository(owner: "hpcaitech", name: "ColossalAI"){{
repository(owner: "{org_name}", name: "{repo_name}"){{
discussions(first: {num} {offset_str}){{
edges {{
cursor
Expand Down Expand Up @@ -134,7 +185,7 @@ def _generate_comment_reply_count_for_discussion(discussion_number, num, cursor:
offset_str = f", before: \"{cursor}\""
query = f"""
{{
repository(owner: "hpcaitech", name: "ColossalAI"){{
repository(owner: "{org_name}", name: "{repo_name}"){{
discussion(number: {discussion_number}){{
title
comments(last: {num} {offset_str}){{
Expand Down Expand Up @@ -191,8 +242,8 @@ def _call_graphql_api(query):
for edge in edges:
# print the discussion title
discussion = edge['node']
discussion_updated_at = str2datetime(discussion['updatedAt'])

discussion_updated_at = datetime.strptime(discussion['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
# check if the updatedAt is within the last 7 days
# if yes, add it to discussion_numbers
if discussion_updated_at > since:
Expand Down Expand Up @@ -250,6 +301,7 @@ def _call_graphql_api(query):
if reply['authorAssociation'] == 'MEMBER':
# check if the updatedAt is within the last 7 days
# if yes, add it to discussion_numbers

reply_updated_at = datetime.strptime(reply['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
if reply_updated_at > since:
member_name = reply['author']['login']
Expand All @@ -260,7 +312,7 @@ def _call_graphql_api(query):
return user_engagement_count


def generate_user_engagement_leaderboard_image(github_token: str, output_path: str) -> bool:
def generate_user_engagement_leaderboard_image(github_token: str, org_name: str, repo_list: List[str], output_path: str) -> bool:
"""
Generate the user engagement leaderboard image for stats within the last 7 days

Expand All @@ -270,23 +322,29 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
"""

# request to the Github API to get the users who have replied the most in the last 7 days
now = datetime.utcnow()
start_datetime = now - timedelta(days=7)
start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
start_datetime = get_utc_time_one_week_ago()
start_datetime_str = datetime2str(start_datetime)

# get the issue/PR comments and discussion comment count
issue_pr_engagement_count = get_issue_pull_request_comments(github_token=github_token, since=start_datetime_str)
discussion_engagement_count = get_discussion_comments(github_token=github_token, since=start_datetime)
total_engagement_count = {}

# update the total engagement count
total_engagement_count.update(issue_pr_engagement_count)
for name, count in discussion_engagement_count.items():
if name in total_engagement_count:
total_engagement_count[name] += count
else:
total_engagement_count[name] = count
def _update_count(counter):
for name, count in counter.items():
if name in total_engagement_count:
total_engagement_count[name] += count
else:
total_engagement_count[name] = count


for repo_name in repo_list:
print(f"Fetching user engagement count for {repo_name}/{repo_name}")
issue_pr_engagement_count = get_issue_pull_request_comments(github_token=github_token, org_name=org_name, repo_name=repo_name, since=start_datetime_str)
discussion_engagement_count = get_discussion_comments(github_token=github_token, org_name=org_name, repo_name=repo_name, since=start_datetime)

# update the total engagement count
_update_count(issue_pr_engagement_count)
_update_count(discussion_engagement_count)

# prepare the data for plotting
x = []
y = []
Expand All @@ -302,9 +360,6 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
x.append(count)
y.append(name)

# use Shanghai time to display on the image
start_datetime_str = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%dT%H:%M:%SZ")

# plot the leaderboard
xlabel = f"Number of Comments made (since {start_datetime_str})"
ylabel = "Member"
Expand All @@ -315,7 +370,7 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
return False


def generate_contributor_leaderboard_image(github_token, output_path) -> bool:
def generate_contributor_leaderboard_image(github_token, org_name, repo_list, output_path) -> bool:
"""
Generate the contributor leaderboard image for stats within the last 7 days

Expand All @@ -324,54 +379,81 @@ def generate_contributor_leaderboard_image(github_token, output_path) -> bool:
output_path (str): the path to save the image
"""
# request to the Github API to get the users who have contributed in the last 7 days
URL = 'https://api.github.com/repos/hpcaitech/ColossalAI/stats/contributors'
headers = {
'Authorization': f'Bearer {github_token}',
'Accept': 'application/vnd.github+json',
'X-GitHub-Api-Version': '2022-11-28'
}

while True:
response = requests.get(URL, headers=headers).json()
counter = Counter()
start_datetime = get_utc_time_one_week_ago()

if len(response) != 0:
# sometimes the Github API returns empty response for unknown reason
# request again if the response is empty
break
def _get_url(org_name, repo_name, page):
return f'https://api.github.com/repos/{org_name}/{repo_name}/pulls?per_page=50&page={page}&state=closed'

def _iterate_by_page(org_name, repo_name):
page = 1
stop = False

while not stop:
print(f"Fetching pull request data for {org_name}/{repo_name} - page{page}")
url = _get_url(org_name, repo_name, page)

contributor_list = []
while True:
response = requests.get(url, headers=headers).json()

# get number of commits for each contributor
start_timestamp = None
for item in response:
num_commits_this_week = item['weeks'][-1]['c']
name = item['author']['login']
contributor = Contributor(name=name, num_commits_this_week=num_commits_this_week)
contributor_list.append(contributor)
if isinstance(response, list):
# sometimes the Github API returns nothing
# request again if the response is not a list
break
print("Empty response, request again...")

# update start_timestamp
start_timestamp = item['weeks'][-1]['w']
if len(response) == 0:
# if the response is empty, stop
stop = True
break

# count the pull request and author from response
for pr_data in response:
merged_at = pr_data['merged_at']
author = pr_data['user']['login']

if merged_at is None:
continue

merge_datetime = str2datetime(merged_at)

if merge_datetime < start_datetime:
# if we found a pull request that is merged before the start_datetime
# we stop
stop = True
break
else:
# record the author
counter.record(author)

# next page
page += 1

for repo_name in repo_list:
_iterate_by_page(org_name, repo_name)

# convert unix timestamp to Beijing datetime
start_datetime = datetime.fromtimestamp(start_timestamp, tz=pytz.timezone('Asia/Shanghai'))
start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
bj_start_datetime = datetime.fromtimestamp(start_datetime.timestamp(), tz=pytz.timezone('Asia/Shanghai'))
bj_start_datetime_str = datetime2str(bj_start_datetime)

# sort by number of commits
contributor_list.sort(key=lambda x: x.num_commits_this_week, reverse=True)
contribution_list = counter.to_sorted_list()

# remove contributors who has zero commits
contributor_list = [x for x in contributor_list if x.num_commits_this_week > 0]

# prepare the data for plotting
x = [x.num_commits_this_week for x in contributor_list]
y = [x.name for x in contributor_list]
author_list = [x[0] for x in contribution_list]
num_commit_list = [x[1] for x in contribution_list]

# plot
if len(x) > 0:
xlabel = f"Number of Commits (since {start_datetime_str})"
if len(author_list) > 0:
xlabel = f"Number of Pull Requests (since {bj_start_datetime_str})"
ylabel = "Contributor"
title = 'Active Contributor Leaderboard'
plot_bar_chart(x, y, xlabel=xlabel, ylabel=ylabel, title=title, output_path=output_path)
plot_bar_chart(num_commit_list, author_list, xlabel=xlabel, ylabel=ylabel, title=title, output_path=output_path)
return True
else:
return False
Expand Down Expand Up @@ -438,10 +520,14 @@ def send_message_to_lark(message: str, webhook_url: str):
GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
CONTRIBUTOR_IMAGE_PATH = 'contributor_leaderboard.png'
USER_ENGAGEMENT_IMAGE_PATH = 'engagement_leaderboard.png'
ORG_NAME = "hpcaitech"

# get all open source repositories
REPO_LIST = get_organization_repositories(GITHUB_TOKEN, ORG_NAME)

# generate images
contrib_success = generate_contributor_leaderboard_image(GITHUB_TOKEN, CONTRIBUTOR_IMAGE_PATH)
engagement_success = generate_user_engagement_leaderboard_image(GITHUB_TOKEN, USER_ENGAGEMENT_IMAGE_PATH)
contrib_success = generate_contributor_leaderboard_image(GITHUB_TOKEN, ORG_NAME, REPO_LIST, CONTRIBUTOR_IMAGE_PATH)
engagement_success = generate_user_engagement_leaderboard_image(GITHUB_TOKEN, ORG_NAME, REPO_LIST, USER_ENGAGEMENT_IMAGE_PATH)

# upload images
APP_ID = os.environ['LARK_APP_ID']
Expand All @@ -457,8 +543,8 @@ def send_message_to_lark(message: str, webhook_url: str):
2. 用户互动榜单

注:
- 开发贡献者测评标准为:本周由公司成员提交的commit次数
- 用户互动榜单测评标准为:本周由公司成员在非成员创建的issue/PR/discussion中回复的次数
- 开发贡献者测评标准为:本周由公司成员与社区在所有开源仓库提交的Pull Request次数
- 用户互动榜单测评标准为:本周由公司成员在非成员在所有开源仓库创建的issue/PR/discussion中回复的次数
"""

send_message_to_lark(message, LARK_WEBHOOK_URL)
Expand All @@ -467,7 +553,7 @@ def send_message_to_lark(message: str, webhook_url: str):
if contrib_success:
send_image_to_lark(contributor_image_key, LARK_WEBHOOK_URL)
else:
send_message_to_lark("本周没有成员贡献commit,无榜单图片生成。", LARK_WEBHOOK_URL)
send_message_to_lark("本周没有成员贡献PR,无榜单图片生成。", LARK_WEBHOOK_URL)

# send user engagement image to lark
if engagement_success:
Expand Down
4 changes: 2 additions & 2 deletions applications/Chat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ More details can be found in the latest news.
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20Speed.jpg" width=450/>
</p>

> DeepSpeedChat performance comes from its blog on 2023 April 12, ColossalChat performance can be reproduced on an AWS p4d.24xlarge node with 8 A100-40G GPUs with the following command: torchrun --standalone --nproc_per_node 8 benchmark_opt_lora_dummy.py --max_timesteps 1 --update_timesteps 1 --use_kernels --strategy colossalai_zero2 --experience_batch_size 64 --train_batch_size 32
> DeepSpeedChat performance comes from its blog on 2023 April 12, ColossalChat performance can be reproduced on an AWS p4d.24xlarge node with 8 A100-40G GPUs with the following command: torchrun --standalone --nproc_per_node 8 benchmark_opt_lora_dummy.py --num_collect_steps 1 --use_kernels --strategy colossalai_zero2 --experience_batch_size 64 --train_batch_size 32

## Install

Expand Down Expand Up @@ -287,7 +287,7 @@ If you only have a single 24G GPU, you can use the following script. `batch_size
torchrun --standalone --nproc_per_node=1 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy naive \
--strategy ddp \
--log_interval 10 \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
Expand Down
Loading