Skip to content
Merged

L #79

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/report_test_coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ on:
jobs:
report-test-coverage:
runs-on: ubuntu-latest
if: ${{ github.event.workflow_run.conclusion == 'success' }}
steps:
- name: "Download artifact"
uses: actions/github-script@v6
Expand Down
212 changes: 149 additions & 63 deletions .github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Any, Dict, List

Expand All @@ -10,17 +9,48 @@
from requests_toolbelt import MultipartEncoder


@dataclass
class Contributor:
class Counter(dict):
    """
    A dict subclass that tallies occurrences of string keys.

    NOTE(review): this shadows ``collections.Counter``, which already
    provides equivalent functionality; the name and interface are kept
    unchanged for caller compatibility.
    """

    def record(self, item: str) -> None:
        """Increment the count for ``item``, starting from 0 if unseen."""
        # dict.get with a default replaces the original if/else branch.
        self[item] = self.get(item, 0) + 1

    def to_sorted_list(self):
        """
        Return the tallied items as ``(item, count)`` tuples, sorted by
        count in descending order.
        """
        return sorted(self.items(), key=lambda kv: kv[1], reverse=True)


def get_utc_time_one_week_ago():
    """
    Return the naive UTC datetime exactly seven days before now.

    NOTE(review): ``datetime.utcnow()`` yields a *naive* datetime and is
    deprecated in newer Python; callers here compare the result against
    other naive datetimes, so the naive form is kept — confirm before
    migrating to timezone-aware datetimes.
    """
    return datetime.utcnow() - timedelta(days=7)


def datetime2str(dt):
    """Format *dt* as a ``YYYY-MM-DDTHH:MM:SSZ`` string."""
    # datetime.__format__ delegates to strftime, so a format spec works.
    return "{:%Y-%m-%dT%H:%M:%SZ}".format(dt)


def str2datetime(string):
    """Parse a ``YYYY-MM-DDTHH:MM:SSZ`` string into a naive datetime."""
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    return datetime.strptime(string, timestamp_format)


def plot_bar_chart(x: List[Any], y: List[Any], xlabel: str, ylabel: str, title: str, output_path: str) -> None:
Expand All @@ -36,7 +66,28 @@ def plot_bar_chart(x: List[Any], y: List[Any], xlabel: str, ylabel: str, title:
plt.savefig(output_path, dpi=1200)


def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str, int]:
def get_organization_repositories(github_token, organization_name) -> List[str]:
    """
    Retrieve the names of all public repositories under an organization.

    Args:
        github_token (str): GitHub token used for API authentication.
        organization_name (str): the GitHub organization login name.

    Returns:
        List[str]: names of the organization's public repositories.
    """
    # prepare header
    headers = {
        'Authorization': f'Bearer {github_token}',
        'Accept': 'application/vnd.github+json',
        'X-GitHub-Api-Version': '2022-11-28'
    }

    repo_list = []
    page = 1

    # The list-org-repos endpoint is paginated (30 items/page by default);
    # the original single request silently dropped every repo past page 1.
    # Request 100 per page and loop until an empty page is returned.
    while True:
        url = (f"https://api.github.com/orgs/{organization_name}/repos"
               f"?type=public&per_page=100&page={page}")
        res = requests.get(url, headers=headers).json()

        if not res:
            break

        repo_list.extend(item['name'] for item in res)
        page += 1

    return repo_list


def get_issue_pull_request_comments(github_token: str, org_name: str, repo_name: str, since: str) -> Dict[str, int]:
"""
Retrieve the issue/PR comments made by our members in the last 7 days.

Expand All @@ -56,7 +107,7 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
# do pagination to the API
page = 1
while True:
comment_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/comments?since={since}&page={page}'
comment_api = f'https://api.github.com/repos/{org_name}/{repo_name}/issues/comments?since={since}&page={page}'
comment_response = requests.get(comment_api, headers=headers).json()

if len(comment_response) == 0:
Expand All @@ -70,7 +121,7 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
continue

issue_id = item['issue_url'].split('/')[-1]
issue_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/{issue_id}'
issue_api = f'https://api.github.com/repos/{org_name}/{repo_name}/issues/{issue_id}'
issue_response = requests.get(issue_api, headers=headers).json()
issue_author_relationship = issue_response['author_association']

Expand All @@ -87,7 +138,7 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
return user_engagement_count


def get_discussion_comments(github_token, since) -> Dict[str, int]:
def get_discussion_comments(github_token: str, org_name: str, repo_name: str, since: str) -> Dict[str, int]:
"""
Retrieve the discussion comments made by our members in the last 7 days.
This is only available via the GitHub GraphQL API.
Expand All @@ -105,7 +156,7 @@ def _generate_discussion_query(num, cursor: str = None):
offset_str = f", after: \"{cursor}\""
query = f"""
{{
repository(owner: "hpcaitech", name: "ColossalAI"){{
repository(owner: "{org_name}", name: "{repo_name}"){{
discussions(first: {num} {offset_str}){{
edges {{
cursor
Expand Down Expand Up @@ -134,7 +185,7 @@ def _generate_comment_reply_count_for_discussion(discussion_number, num, cursor:
offset_str = f", before: \"{cursor}\""
query = f"""
{{
repository(owner: "hpcaitech", name: "ColossalAI"){{
repository(owner: "{org_name}", name: "{repo_name}"){{
discussion(number: {discussion_number}){{
title
comments(last: {num} {offset_str}){{
Expand Down Expand Up @@ -191,8 +242,8 @@ def _call_graphql_api(query):
for edge in edges:
# print the discussion title
discussion = edge['node']
discussion_updated_at = str2datetime(discussion['updatedAt'])

discussion_updated_at = datetime.strptime(discussion['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
# check if the updatedAt is within the last 7 days
# if yes, add it to discussion_numbers
if discussion_updated_at > since:
Expand Down Expand Up @@ -250,6 +301,7 @@ def _call_graphql_api(query):
if reply['authorAssociation'] == 'MEMBER':
# check if the updatedAt is within the last 7 days
# if yes, add it to discussion_numbers

reply_updated_at = datetime.strptime(reply['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
if reply_updated_at > since:
member_name = reply['author']['login']
Expand All @@ -260,7 +312,7 @@ def _call_graphql_api(query):
return user_engagement_count


def generate_user_engagement_leaderboard_image(github_token: str, output_path: str) -> bool:
def generate_user_engagement_leaderboard_image(github_token: str, org_name: str, repo_list: List[str], output_path: str) -> bool:
"""
Generate the user engagement leaderboard image for stats within the last 7 days

Expand All @@ -270,23 +322,29 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
"""

# request to the Github API to get the users who have replied the most in the last 7 days
now = datetime.utcnow()
start_datetime = now - timedelta(days=7)
start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
start_datetime = get_utc_time_one_week_ago()
start_datetime_str = datetime2str(start_datetime)

# get the issue/PR comments and discussion comment count
issue_pr_engagement_count = get_issue_pull_request_comments(github_token=github_token, since=start_datetime_str)
discussion_engagement_count = get_discussion_comments(github_token=github_token, since=start_datetime)
total_engagement_count = {}

# update the total engagement count
total_engagement_count.update(issue_pr_engagement_count)
for name, count in discussion_engagement_count.items():
if name in total_engagement_count:
total_engagement_count[name] += count
else:
total_engagement_count[name] = count
def _update_count(counter):
for name, count in counter.items():
if name in total_engagement_count:
total_engagement_count[name] += count
else:
total_engagement_count[name] = count


for repo_name in repo_list:
print(f"Fetching user engagement count for {repo_name}/{repo_name}")
issue_pr_engagement_count = get_issue_pull_request_comments(github_token=github_token, org_name=org_name, repo_name=repo_name, since=start_datetime_str)
discussion_engagement_count = get_discussion_comments(github_token=github_token, org_name=org_name, repo_name=repo_name, since=start_datetime)

# update the total engagement count
_update_count(issue_pr_engagement_count)
_update_count(discussion_engagement_count)

# prepare the data for plotting
x = []
y = []
Expand All @@ -302,9 +360,6 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
x.append(count)
y.append(name)

# use Shanghai time to display on the image
start_datetime_str = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%dT%H:%M:%SZ")

# plot the leaderboard
xlabel = f"Number of Comments made (since {start_datetime_str})"
ylabel = "Member"
Expand All @@ -315,7 +370,7 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
return False


def generate_contributor_leaderboard_image(github_token, output_path) -> bool:
def generate_contributor_leaderboard_image(github_token, org_name, repo_list, output_path) -> bool:
"""
Generate the contributor leaderboard image for stats within the last 7 days

Expand All @@ -324,54 +379,81 @@ def generate_contributor_leaderboard_image(github_token, output_path) -> bool:
output_path (str): the path to save the image
"""
# request to the Github API to get the users who have contributed in the last 7 days
URL = 'https://api.github.com/repos/hpcaitech/ColossalAI/stats/contributors'
headers = {
'Authorization': f'Bearer {github_token}',
'Accept': 'application/vnd.github+json',
'X-GitHub-Api-Version': '2022-11-28'
}

while True:
response = requests.get(URL, headers=headers).json()
counter = Counter()
start_datetime = get_utc_time_one_week_ago()

if len(response) != 0:
# sometimes the Github API returns empty response for unknown reason
# request again if the response is empty
break
def _get_url(org_name, repo_name, page):
return f'https://api.github.com/repos/{org_name}/{repo_name}/pulls?per_page=50&page={page}&state=closed'

def _iterate_by_page(org_name, repo_name):
page = 1
stop = False

while not stop:
print(f"Fetching pull request data for {org_name}/{repo_name} - page{page}")
url = _get_url(org_name, repo_name, page)

contributor_list = []
while True:
response = requests.get(url, headers=headers).json()

# get number of commits for each contributor
start_timestamp = None
for item in response:
num_commits_this_week = item['weeks'][-1]['c']
name = item['author']['login']
contributor = Contributor(name=name, num_commits_this_week=num_commits_this_week)
contributor_list.append(contributor)
if isinstance(response, list):
# sometimes the Github API returns nothing
# request again if the response is not a list
break
print("Empty response, request again...")

# update start_timestamp
start_timestamp = item['weeks'][-1]['w']
if len(response) == 0:
# if the response is empty, stop
stop = True
break

# count the pull request and author from response
for pr_data in response:
merged_at = pr_data['merged_at']
author = pr_data['user']['login']

if merged_at is None:
continue

merge_datetime = str2datetime(merged_at)

if merge_datetime < start_datetime:
# if we found a pull request that is merged before the start_datetime
# we stop
stop = True
break
else:
# record the author1
counter.record(author)

# next page
page += 1

for repo_name in repo_list:
_iterate_by_page(org_name, repo_name)

# convert unix timestamp to Beijing datetime
start_datetime = datetime.fromtimestamp(start_timestamp, tz=pytz.timezone('Asia/Shanghai'))
start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
bj_start_datetime = datetime.fromtimestamp(start_datetime.timestamp(), tz=pytz.timezone('Asia/Shanghai'))
bj_start_datetime_str = datetime2str(bj_start_datetime)

# sort by number of commits
contributor_list.sort(key=lambda x: x.num_commits_this_week, reverse=True)
contribution_list = counter.to_sorted_list()

# remove contributors who has zero commits
contributor_list = [x for x in contributor_list if x.num_commits_this_week > 0]

# prepare the data for plotting
x = [x.num_commits_this_week for x in contributor_list]
y = [x.name for x in contributor_list]
author_list = [x[0] for x in contribution_list]
num_commit_list = [x[1] for x in contribution_list]

# plot
if len(x) > 0:
xlabel = f"Number of Commits (since {start_datetime_str})"
if len(author_list) > 0:
xlabel = f"Number of Pull Requests (since {bj_start_datetime_str})"
ylabel = "Contributor"
title = 'Active Contributor Leaderboard'
plot_bar_chart(x, y, xlabel=xlabel, ylabel=ylabel, title=title, output_path=output_path)
plot_bar_chart(num_commit_list, author_list, xlabel=xlabel, ylabel=ylabel, title=title, output_path=output_path)
return True
else:
return False
Expand Down Expand Up @@ -438,10 +520,14 @@ def send_message_to_lark(message: str, webhook_url: str):
GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
CONTRIBUTOR_IMAGE_PATH = 'contributor_leaderboard.png'
USER_ENGAGEMENT_IMAGE_PATH = 'engagement_leaderboard.png'
ORG_NAME = "hpcaitech"

# get all open source repositories
REPO_LIST = get_organization_repositories(GITHUB_TOKEN, ORG_NAME)

# generate images
contrib_success = generate_contributor_leaderboard_image(GITHUB_TOKEN, CONTRIBUTOR_IMAGE_PATH)
engagement_success = generate_user_engagement_leaderboard_image(GITHUB_TOKEN, USER_ENGAGEMENT_IMAGE_PATH)
contrib_success = generate_contributor_leaderboard_image(GITHUB_TOKEN, ORG_NAME, REPO_LIST, CONTRIBUTOR_IMAGE_PATH)
engagement_success = generate_user_engagement_leaderboard_image(GITHUB_TOKEN, ORG_NAME, REPO_LIST, USER_ENGAGEMENT_IMAGE_PATH)

# upload images
APP_ID = os.environ['LARK_APP_ID']
Expand All @@ -457,8 +543,8 @@ def send_message_to_lark(message: str, webhook_url: str):
2. 用户互动榜单

注:
- 开发贡献者测评标准为:本周由公司成员提交的commit次数
- 用户互动榜单测评标准为:本周由公司成员在非成员创建的issue/PR/discussion中回复的次数
- 开发贡献者测评标准为:本周由公司成员与社区在所有开源仓库提交的Pull Request次数
- 用户互动榜单测评标准为:本周由公司成员在非成员在所有开源仓库创建的issue/PR/discussion中回复的次数
"""

send_message_to_lark(message, LARK_WEBHOOK_URL)
Expand All @@ -467,7 +553,7 @@ def send_message_to_lark(message: str, webhook_url: str):
if contrib_success:
send_image_to_lark(contributor_image_key, LARK_WEBHOOK_URL)
else:
send_message_to_lark("本周没有成员贡献commit,无榜单图片生成。", LARK_WEBHOOK_URL)
send_message_to_lark("本周没有成员贡献PR,无榜单图片生成。", LARK_WEBHOOK_URL)

# send user engagement image to lark
if engagement_success:
Expand Down
4 changes: 2 additions & 2 deletions applications/Chat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ More details can be found in the latest news.
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20Speed.jpg" width=450/>
</p>

> DeepSpeedChat performance comes from its blog on 2023 April 12, ColossalChat performance can be reproduced on an AWS p4d.24xlarge node with 8 A100-40G GPUs with the following command: torchrun --standalone --nproc_per_node 8 benchmark_opt_lora_dummy.py --max_timesteps 1 --update_timesteps 1 --use_kernels --strategy colossalai_zero2 --experience_batch_size 64 --train_batch_size 32
> DeepSpeedChat performance comes from its blog on 2023 April 12, ColossalChat performance can be reproduced on an AWS p4d.24xlarge node with 8 A100-40G GPUs with the following command: torchrun --standalone --nproc_per_node 8 benchmark_opt_lora_dummy.py --num_collect_steps 1 --use_kernels --strategy colossalai_zero2 --experience_batch_size 64 --train_batch_size 32

## Install

Expand Down Expand Up @@ -287,7 +287,7 @@ If you only have a single 24G GPU, you can use the following script. `batch_size
torchrun --standalone --nproc_per_node=1 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy naive \
--strategy ddp \
--log_interval 10 \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
Expand Down
Loading