Homework6

读取data中2023_6文件夹中的openrank数据集，分析美国排名前一百的项目的value的最大值、最小值、均值以及中位数。

In [1]:

Copied!





import pandas as pd

# Summarise the OpenRank `value` column of the June 2023 dataset.
# NOTE(review): the task asks for the top-100 US projects, but no filter or
# row limit is applied here; this presumes the CSV already contains exactly
# those rows -- confirm against the data file.
openrank = pd.read_csv('data/2023_6/open_rank_20236.csv')
values = openrank['value']

maxx = values.max()
minn = values.min()
mean1 = values.mean()
median1 = values.median()

print('最大值：', maxx)
print('最小值：', minn)
print('平均值：', mean1)
print('中位数：', median1)

最大值： 1394.45
最小值： 200.68
平均值： 346.9679
中位数： 273.66999999999996

读取data中2022文件夹下的activity_2022文件，分析美国排名前十的项目的平均增长率。

In [3]:

Copied!





# Average period-over-period growth rate of the first ten rows of the 2022
# activity table.
# NOTE(review): `head(10)` keeps the first ten rows as stored; this presumes
# the CSV is already ordered by rank -- confirm.
df = pd.read_csv('data/2022/activity_2022.csv').head(10)
growth_rates = []

for _, row in df.iterrows():
    # Everything after the first column is treated as one value per period
    # (presumably the first column identifies the project -- verify).
    monthly_values = row.iloc[1:].values
    # Relative change between each pair of consecutive periods.
    # NOTE(review): a zero in any period except the last would be used as a
    # divisor here; the data is assumed to contain no zeros -- confirm.
    monthly_growth = [
        (later - earlier) / earlier
        for earlier, later in zip(monthly_values[:-1], monthly_values[1:])
    ]
    average_growth = sum(monthly_growth) / len(monthly_growth)
    growth_rates.append(average_growth)

# Mean of the per-row averages, expressed as a percentage.
average_growth_rate = 100 * sum(growth_rates) / len(growth_rates)
print("平均增长率:", average_growth_rate)

平均增长率: 1.2502691855077124

data/2022/china_2022.csv表示中国开源领域排名前十的企业。data/2022/global_2022.csv表示开源领域全球前十的企业，请通过各种统计指标比较两者的各种数据差异。

In [4]:

Copied!





import pandas as pd

# Load the two top-ten company tables that are being compared.
china_data = pd.read_csv('data/2022/china_2022.csv')
global_data = pd.read_csv('data/2022/global_2022.csv')

# describe() reports count, mean, std, min, quartiles and max for every
# numeric column of each table.
china_stats = china_data.describe()
global_stats = global_data.describe()

print("中国开源企业统计数据:")
print(china_stats)

print("\n全球开源企业统计数据:")
print(global_stats)

中国开源企业统计数据:
       issue_comment    open_issue     open_pull  review_comment  \
count      10.000000     10.000000     10.000000        10.00000   
mean    61205.500000   9169.200000  16912.700000     19857.50000   
std     49332.487917   6810.653563  10068.922805     21115.07349   
min     11741.000000    752.000000   1823.000000      2113.00000   
25%     30943.500000   4625.250000   9376.750000      3120.00000   
50%     39141.000000   6889.500000  15953.000000     10851.00000   
75%     82179.500000  14225.500000  22483.250000     31507.75000   
max    167814.000000  22397.000000  35266.000000     60402.00000   

        merged_pull      rank         value  rankDelta    valueDelta  
count     10.000000  10.00000      10.00000  10.000000     10.000000  
mean   13764.100000   5.50000   40269.53400   5.300000   9265.007000  
std     7692.752064   3.02765   30905.17181   9.944848   5327.824003  
min     1165.000000   1.00000   12033.71000   0.000000   2329.360000  
25%     8067.250000   3.25000   15161.03750   0.000000   5268.947500  
50%    13705.500000   5.50000   29789.23500   0.000000   9882.000000  
75%    18216.250000   7.75000   58554.96250   3.250000  10666.070000  
max    26732.000000  10.00000  103368.49000  25.000000  21093.110000  

全球开源企业统计数据:
       issue_comment     open_issue      open_pull  review_comment  \
count   1.000000e+01      10.000000      10.000000       10.000000   
mean    3.414094e+05   43300.200000   83423.000000   120392.100000   
std     4.226103e+05   54001.071938   87658.295125   128239.763444   
min     7.853000e+04   13162.000000   27414.000000    35072.000000   
25%     9.780075e+04   16630.500000   32980.000000    49480.000000   
50%     1.786290e+05   21648.000000   49103.500000    70433.500000   
75%     3.156160e+05   38010.500000  102867.000000   148395.000000   
max     1.437317e+06  189185.000000  309685.000000   456166.000000   

         merged_pull      rank          value  rankDelta    valueDelta  
count      10.000000  10.00000      10.000000  10.000000     10.000000  
mean    62472.000000   5.50000  215855.491000   0.100000  11906.921000  
std     73265.180035   3.02765  235189.889662   0.994429  26288.856142  
min     15418.000000   1.00000   71636.820000  -2.000000 -47388.580000  
25%     22151.500000   3.25000   89080.312500   0.000000   7749.432500  
50%     33288.000000   5.50000  102790.850000   0.000000  14209.235000  
75%     75093.750000   7.75000  252184.732500   1.000000  22235.555000  
max    257123.000000  10.00000  824848.670000   1.000000  57536.090000

In [6]:

Copied!





import numpy as np
import matplotlib.pyplot as plt

columns_to_analyze = ['issue_comment', 'open_issue', 'open_pull', 'review_comment', 'merged_pull', 'value', 'rankDelta', 'valueDelta']

# One row per analysed column, one column per statistic, for each data set.
china_stats = china_data[columns_to_analyze].agg(['mean', 'max', 'min', 'median']).transpose()
china_stats.columns = ['China_Mean', 'China_Max', 'China_Min', 'China_Median']
global_stats = global_data[columns_to_analyze].agg(['mean', 'max', 'min', 'median']).transpose()
global_stats.columns = ['Global_Mean', 'Global_Max', 'Global_Min', 'Global_Median']
# Side-by-side table: China_* columns followed by Global_* columns.
comparison_stats = pd.concat([china_stats, global_stats], axis=1)


def _annotate_bars(bars):
    """Write each bar's height, to one decimal place, at the top of the bar."""
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:.1f}', ha='center', va='bottom')


print("可视化对比：")

# One figure per statistic, with China and Global bars paired per column.
for stat in ['Mean', 'Max', 'Min', 'Median']:
    plt.figure(figsize=(12, 6))

    china_values = comparison_stats[f'China_{stat}']
    global_values = comparison_stats[f'Global_{stat}']
    metrics = comparison_stats.index

    bar_width = 0.5
    index = np.arange(len(metrics))

    # The Global bar of each pair is shifted right by one bar width.
    china_bars = plt.bar(index, china_values, bar_width, label=f'China {stat}', alpha=0.7)
    global_bars = plt.bar(index + bar_width, global_values, bar_width, label=f'Global {stat}', alpha=0.7)

    _annotate_bars(china_bars)
    _annotate_bars(global_bars)

    plt.xlabel('items')
    plt.ylabel(stat)
    plt.title(f'China vs Global: {stat} comparison')
    # Put each tick halfway between the centres of the two paired bars.
    plt.xticks(index + bar_width / 2, metrics, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

可视化对比：

No description has been provided for this image

贝叶斯定理

贝叶斯定理参考：https://zh.wikipedia.org/wiki/%E8%B4%9D%E5%8F%B6%E6%96%AF%E5%AE%9A%E7%90%86

根据 OpenLeaderboard 上对前 10000 个活跃的项目统计，工具组件型项目占比 50 %，系统应用型占比 25 %，而内容资源型（非软件类）项目占比 25 %，成三分天下的态势。

非软件类项目中，带有 HTML/Markdown 标签的项目占 85 %，而软件类项目中带 HTML/Markdown标签的项目占比则为 10 %（注：HTML/Markdown 一般可用来书写文档内容）

工具组件型项目中，JavaScript 语言的项目占比 35 %，而非工具组件型项目中， JavaScript 语言的项目占比则为 10 %（注：JavaScript 是一种脚本编程语言，可以在网页上实现复杂的功能）

已知一个项目带有 HTML/Markdown 标签，那么该项目是非软件型项目的概率是多少？

A 是事件“该项目是非软件型项目”。 B 是事件“该项目带有 HTML/Markdown 标签”。我们需要计算 P(A∣B)，即“已知项目带有 HTML/Markdown 标签，该项目是非软件型项目的概率”。

已知 $P(A)$：非软件类项目的先验概率为 25%。 $P(B∣A)$：非软件类项目中带有 HTML/Markdown 标签的概率为 85%。 $P(B∣A^{c})$：软件类项目（工具组件型和系统应用型）中带有 HTML/Markdown 标签的概率为 10%。 $P(A^{c})$：软件类项目的先验概率是 75%（工具组件型和系统应用型分别占 50% 和 25%）。

$P(B)=P(B∣A)⋅P(A)+P(B∣A^{c})⋅P(A^{c})$

代入： $P(B)=0.85⋅0.25+0.10⋅0.75=0.2125+0.075=0.2875$

然后代入贝叶斯公式计算 P(A∣B)： $P(A∣B)=\frac{P(B∣A)⋅P(A)}{P(B)} = \frac{0.85⋅0.25}{0.85⋅0.25+0.10⋅0.75} \approx 0.7391$

接上文，已知一个项目是由 JavaScript 语言编写的，那么它是工具组件型项目的概率是多少？

事件C: 一个项目是工具组件型项目

事件D: 一个项目是由JS编写的

$P(C) = 0.50, P(\bar{C}) = 0.50; P(D|C) = 0.35, P(D|\bar{C}) = 0.10$

由贝叶斯定理，

$P(C|D) = \frac{P(C)P(D|C)}{P(D)} = \frac{P(C)P(D|C)}{P(C)P(D|C)+P(\bar{C})P(D|\bar{C})} = \frac{0.175}{0.175+0.05} \approx 0.7778$

根据以下数据建立可视化无向图

user = [1, 2, 3, 4]

edge = [(1, 2), (2, 3), (3, 4), (4, 1)]

In [10]:

Copied!





import networkx as nx
import matplotlib.pyplot as plt

nodes = [1, 2, 3, 4]
edges = [(1, 2), (2, 3), (3, 4), (4, 1)]

# Build the undirected graph from the node and edge lists, then draw it with
# networkx's default layout.
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)
nx.draw_networkx(G, with_labels=True, node_color='green')
plt.show()

根据以下数据建立可视化有向图

users = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

edges = [(0, 1), (1, 0), (0, 2), (2, 0), (1, 2), (2, 1), (1, 3), (2, 3), (3, 4), (5, 4), (5, 6), (7, 5), (6, 8), (8, 7), (8, 9)]

In [11]:

Copied!





nodes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
edges = [(0, 1), (1, 0), (0, 2), (2, 0), (1, 2),
            (2, 1), (1, 3), (2, 3), (3, 4), (5, 4),
            (5, 6), (7, 5), (6, 8), (8, 7), (8, 9)]

# Build the directed graph from the node and edge lists.
G = nx.DiGraph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

# Hand-picked (x, y) position for each node, listed in the same order as
# `nodes`. draw_networkx documents `pos` as a dict keyed by node, so the
# mapping is built explicitly rather than relying on list indices matching
# the node labels.
coordinates = [(1, 3), (2, 4), (2, 2), (2, 1), (3, 3), (4, 1), (5, 4), (5, 2), (6, 3), (7, 4)]
pos = dict(zip(nodes, coordinates))

nx.draw_networkx(G, pos, with_labels=True, node_color='red')
plt.show()

针对第七题构建的有向图，计算并输出每个节点的pagerank值。同时根据pagerank调整可视化图的大小，使得PageRank越大的节点在可视化结果中也越大。 pageRank算法原理：https://zh.wikipedia.org/wiki/PageRank

In [13]:

Copied!





d = 0.9  # damping factor
N = len(G.nodes())  # number of nodes
max_iter = 30  # maximum number of iterations
tol = 1e-5  # convergence threshold on the summed absolute change

# Start with the same PageRank value for every node.
pagerank = {node: 1 / N for node in G.nodes()}

# NOTE(review): a node without outgoing edges passes no rank to any other
# node in this update, so the scores need not sum to 1 if G has such nodes.
for i in range(max_iter):
    new_pagerank = {}
    for node in G.nodes():
        # Rank received from each predecessor, which splits its current rank
        # evenly across its outgoing edges.
        rank_sum = 0
        for neighbor in G.predecessors(node):
            rank_sum += pagerank[neighbor] / G.out_degree(neighbor)

        new_pagerank[node] = (1 - d) / N + d * rank_sum

    # Convergence check: total absolute change over all nodes.
    diff = sum(abs(new_pagerank[node] - pagerank[node]) for node in G.nodes())
    pagerank = new_pagerank
    if diff < tol:
        print(f"经过 {i + 1} 迭代后得到结果")
        break

# The task asks for every node's PageRank value to be output.
for node, score in pagerank.items():
    print(f"节点 {node} 的 PageRank 值: {score:.6f}")

# Logarithmic scaling of the node sizes: log(1 + score), multiplied by a
# constant factor to get a drawable marker size.
node_sizes = [np.log(pagerank[node] + 1)  * 50000  for node in G.nodes()]

# Visualisation
plt.figure(figsize=(10, 8))
# Hand-picked (x, y) positions keyed by integer node label 0, 1, 2, ...;
# draw_networkx documents `pos` as a dict keyed by node.
pos = dict(enumerate([(1, 3), (2, 4), (2, 2), (2, 1), (3, 3), (4, 1), (5, 4), (5, 2), (6, 3), (7, 4)]))
nx.draw_networkx(
    G,
    pos,
    with_labels=True,
    node_size=node_sizes,
    node_color='yellow',
)
plt.title("pagerank")
plt.show()

经过 27 迭代后得到结果