data——watsh

import pandas as pd
import networkx as nx
from tqdm import tqdm
import numpy as np
# Function to read and rename excel based on the new data structure
def read_and_rename_excel(file_path):
    df = pd.read_excel(file_path)
    # Assign descriptive column names; the workbook is assumed to contain
    # exactly these 21 columns in this order
    df.columns = ['stock_code', 'observed_firm', 'year',
                  'news_observed_direct', 'exploration_observed', 'exploitation_observed',
                  'direct_firm', 'news_direct_indirect', 'exploration_direct', 'exploitation_direct',
                  'indirect_firm',
                  'exploration_strategy_observed', 'exploitation_strategy_observed',
                  'exploration_strategy_direct', 'exploitation_strategy_direct',
                  'strategy_similarity', 'strategy_similarity_yearly', 'count',
                  'strategy_similarity_vector', 'strategy_similarity_vector2', 'strategy_similarity_Euclidean']
    # Add empty columns for the network structure and centrality measures
    network_structure_vars = ['density', 'centrality', 'average_clustering_coefficient',
                              'degree_distribution_mean', 'degree_distribution_std',
                              'average_path_length', 'network_stability', 'modularity']
    centrality_vars = ['degree_centrality', 'closeness_centrality', 'betweenness_centrality',
                       'eigenvector_centrality', 'pagerank']
    for var in network_structure_vars + centrality_vars:
        df[var] = np.nan
    return df
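# A minimal sketch of a guard around the loader above: the hard-coded rename
# list assumes the workbook has exactly 21 columns in this exact order, so a
# width check fails fast on a mismatched file. The default path is hypothetical.
def _demo_read_with_check(file_path="example_data.xlsx"):
    raw = pd.read_excel(file_path)
    expected_n_cols = 21  # length of the rename list in read_and_rename_excel
    if raw.shape[1] != expected_n_cols:
        raise ValueError(f"expected {expected_n_cols} columns, got {raw.shape[1]}")
    return read_and_rename_excel(file_path)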
# Function to build the network for one observed firm in one year; edge weights
# are exploration/exploitation ratios
def build_network(data, year, observed_firm):
    network = nx.Graph()
    year_data = data[(data['year'] == year) & (data['observed_firm'] == observed_firm)]
    for _, row in year_data.iterrows():
        direct_firm = row['direct_firm']
        # Weight the observed->direct tie by the observed firm's ratio of
        # exploration to exploitation, guarding against division by zero
        weight_1 = (row['exploration_observed'] / row['exploitation_observed']
                    if row['exploitation_observed'] != 0 else 0)
        network.add_edge(observed_firm, direct_firm, weight=weight_1)
        # Add the direct->indirect tie when an indirect firm is recorded
        indirect_firm = row['indirect_firm']
        if pd.notna(indirect_firm):
            weight_2 = (row['exploration_direct'] / row['exploitation_direct']
                        if row['exploitation_direct'] != 0 else 0)
            network.add_edge(direct_firm, indirect_firm, weight=weight_2)
    return network
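# A self-contained sketch (toy data, not the real workbook) of what
# build_network produces: observed->direct edges weighted by the observed
# firm's exploration/exploitation ratio, a direct->indirect edge when an
# indirect firm is present, and a weight of 0 when exploitation is zero.
def _demo_build_network():
    toy = pd.DataFrame({
        'year': [2020, 2020],
        'observed_firm': ['A', 'A'],
        'direct_firm': ['B', 'C'],
        'indirect_firm': ['D', np.nan],
        'exploration_observed': [4, 2],
        'exploitation_observed': [2, 0],  # second row hits the zero guard
        'exploration_direct': [3, 1],
        'exploitation_direct': [1, 1],
    })
    g = build_network(toy, 2020, 'A')
    # Expected edge set: A-B (weight 2.0), B-D (weight 3.0), A-C (weight 0)
    print(g.edges(data=True))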
# Function to calculate centrality measures for a single firm; each networkx
# call returns a dict over all nodes, which is then indexed at observed_firm
def calculate_centrality_measures(network, observed_firm):
    centrality_measures = {}
    centrality_measures['degree_centrality'] = nx.degree_centrality(network)[observed_firm]
    centrality_measures['closeness_centrality'] = nx.closeness_centrality(network)[observed_firm]
    centrality_measures['betweenness_centrality'] = nx.betweenness_centrality(network)[observed_firm]
    # Power iteration can fail to converge on some graphs, hence the raised cap
    centrality_measures['eigenvector_centrality'] = nx.eigenvector_centrality(
        network, tol=1e-6, max_iter=1000)[observed_firm]
    centrality_measures['pagerank'] = nx.pagerank(network)[observed_firm]
    return centrality_measures
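# A quick sanity check on a toy star graph: the hub of a four-spoke star should
# dominate every measure computed above. Sketch only, not part of the pipeline.
def _demo_centrality():
    star = nx.star_graph(4)  # node 0 is the hub, nodes 1-4 are leaves
    print(calculate_centrality_measures(star, 0))
    # degree_centrality of the hub is 1.0; each leaf would score 0.25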
# Function to calculate network structure measures
def calculate_network_structure_measures(network):
    network_structure_measures = {}
    network_structure_measures['density'] = nx.density(network)
    node_centrality = nx.degree_centrality(network)
    network_structure_measures['centrality'] = sum(node_centrality.values()) / len(node_centrality)
    network_structure_measures['average_clustering_coefficient'] = nx.average_clustering(network)
    # Mean and std of the full degree sequence (the original Counter-based
    # version averaged over distinct degree values, which biases both statistics)
    degree_sequence = [d for _, d in network.degree()]
    network_structure_measures['degree_distribution_mean'] = pd.Series(degree_sequence).mean()
    network_structure_measures['degree_distribution_std'] = pd.Series(degree_sequence).std()
    # Average path length is only defined on a connected graph, so restrict it
    # to the giant component
    giant = max(nx.connected_components(network), key=len)
    network_structure_measures['average_path_length'] = nx.average_shortest_path_length(network.subgraph(giant))
    # Algebraic connectivity: the second-smallest Laplacian eigenvalue (Fiedler
    # value); eigvalsh is the routine for symmetric matrices and returns real
    # eigenvalues in ascending order, avoiding the complex-sort failure of eigvals
    laplacian = nx.laplacian_matrix(network).toarray()
    eigenvalues = np.linalg.eigvalsh(laplacian)
    network_structure_measures['network_stability'] = eigenvalues[1]
    communities = nx.algorithms.community.greedy_modularity_communities(network)
    network_structure_measures['modularity'] = nx.algorithms.community.modularity(network, communities)
    return network_structure_measures
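# The structure measures can be smoke-tested on a built-in benchmark network;
# Zachary's karate club graph ships with networkx and is connected, so every
# measure (including average_path_length and the Fiedler value used as
# network_stability) is well defined. Sketch only.
def _demo_structure_measures():
    g = nx.karate_club_graph()
    for name, value in calculate_network_structure_measures(g).items():
        print(f"{name}: {value:.4f}")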
# Read the data
file_path = "/Users/nicole/Desktop/EXCEL数据/final_data.xlsx"
df = read_and_rename_excel(file_path)
# Build one network per (year, observed firm) pair, keyed by observed firm so
# the results can be written back to the right rows
observed_firms_by_year = df.groupby('year')['observed_firm'].unique().to_dict()
networks = {year: {observed_firm: build_network(df, year, observed_firm)
                   for observed_firm in tqdm(observed_firms, desc=f"Processing year {year}")}
            for year, observed_firms in observed_firms_by_year.items()}
# Calculate the centrality and network structure measures for each network and
# write them back only to the rows of the firm the network was built for
# (iterating over every node in the network, as the original loop did, lets one
# firm's network overwrite the measures of every other firm appearing in it)
for year, networks_in_year in networks.items():
    for observed_firm, network in tqdm(networks_in_year.items(),
                                       desc=f"Processing network measures for year {year}"):
        network_structure_measures = calculate_network_structure_measures(network)
        centrality_measures = calculate_centrality_measures(network, observed_firm)
        mask = (df['year'] == year) & (df['observed_firm'] == observed_firm)
        df.loc[mask, list(network_structure_measures)] = list(network_structure_measures.values())
        df.loc[mask, list(centrality_measures)] = list(centrality_measures.values())
# Save the updated DataFrame to a new Excel file
output_file_path = "/Users/nicole/Desktop/EXCEL数据/final_data_networkcalculation.xlsx"
df.to_excel(output_file_path, index=False)
print(f"Results saved to {output_file_path}")