import collections

import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

# Column names for the measures added to the DataFrame. These lists are the
# single source of truth for both column creation and the write-back step, so
# value order can never drift out of sync with the column order.
NETWORK_STRUCTURE_VARS = ['density', 'centrality', 'average_clustering_coefficient',
                          'degree_distribution_mean', 'degree_distribution_std',
                          'average_path_length', 'network_stability', 'modularity']
CENTRALITY_VARS = ['degree_centrality', 'closeness_centrality', 'betweenness_centrality',
                   'eigenvector_centrality', 'pagerank']


def read_and_rename_excel(file_path):
    """Read the raw Excel file, rename its columns, and add empty result columns.

    Parameters
    ----------
    file_path : str
        Path to the Excel workbook.

    Returns
    -------
    pandas.DataFrame
        The data with standardized column names plus one NaN-initialized
        column per network-structure and centrality measure.
    """
    df = pd.read_excel(file_path)
    # Rename positionally: the workbook is assumed to have exactly these 21
    # columns in this order — TODO confirm against the source file.
    df.columns = ['stock_code', 'observed_firm', 'year', 'news_observed_direct',
                  'exploration_observed', 'exploitation_observed', 'direct_firm',
                  'news_direct_indirect', 'exploration_direct', 'exploitation_direct',
                  'indirect_firm', 'exploration_strategy_observed',
                  'exploitation_strategy_observed', 'exploration_strategy_direct',
                  'exploitation_strategy_direct', 'strategy_similarity',
                  'strategy_similarity_yearly', 'count', 'strategy_similarity_vector',
                  'strategy_similarity_vector2', 'strategy_similarity_Euclidean']
    # Pre-create the output columns so later .loc writes never add columns.
    for var in NETWORK_STRUCTURE_VARS + CENTRALITY_VARS:
        df[var] = np.nan
    return df


def build_network(data, year, observed_firm):
    """Build the ego network for one observed firm in one year.

    Edges carry a weight equal to the exploration/exploitation ratio
    (0 when the exploitation count is 0, to avoid division by zero):
    observed_firm -> direct_firm uses the observed firm's counts, and
    direct_firm -> indirect_firm (when present) uses the direct firm's counts.

    Parameters
    ----------
    data : pandas.DataFrame
        The full dataset (output of read_and_rename_excel).
    year : scalar
        Year to filter on.
    observed_firm : scalar
        Observed firm to filter on.

    Returns
    -------
    networkx.Graph
        Undirected weighted graph of the firm's direct and indirect ties.
    """
    network = nx.Graph()
    year_data = data[(data['year'] == year) & (data['observed_firm'] == observed_firm)]
    for _, row in year_data.iterrows():
        direct_firm = row['direct_firm']
        exploit_obs = row['exploitation_observed']
        weight_1 = row['exploration_observed'] / exploit_obs if exploit_obs != 0 else 0
        # Use the row's own observed_firm value rather than rebinding the
        # function parameter (the original shadowed it).
        network.add_edge(row['observed_firm'], direct_firm, weight=weight_1)
        indirect_firm = row['indirect_firm']
        if pd.notna(indirect_firm):
            exploit_dir = row['exploitation_direct']
            weight_2 = row['exploration_direct'] / exploit_dir if exploit_dir != 0 else 0
            network.add_edge(direct_firm, indirect_firm, weight=weight_2)
    return network


def calculate_centrality_measures(network, observed_firm):
    """Return the five node-level centrality measures for `observed_firm`.

    Raises
    ------
    KeyError
        If `observed_firm` is not a node of `network`.
    networkx.PowerIterationFailedConvergence
        If eigenvector centrality fails to converge within max_iter.
    """
    centrality_measures = {}
    centrality_measures['degree_centrality'] = nx.degree_centrality(network)[observed_firm]
    centrality_measures['closeness_centrality'] = nx.closeness_centrality(network)[observed_firm]
    centrality_measures['betweenness_centrality'] = nx.betweenness_centrality(network)[observed_firm]
    centrality_measures['eigenvector_centrality'] = nx.eigenvector_centrality(
        network, tol=1e-6, max_iter=1000)[observed_firm]
    centrality_measures['pagerank'] = nx.pagerank(network)[observed_firm]
    return centrality_measures


def calculate_network_structure_measures(network):
    """Return graph-level structure measures as a dict keyed like NETWORK_STRUCTURE_VARS.

    Measures: density, mean degree centrality, average clustering coefficient,
    mean/std of the degree sequence, average shortest path length on the giant
    component, algebraic connectivity (second-smallest Laplacian eigenvalue),
    and modularity of the greedy-modularity community partition.
    """
    network_structure_measures = {}
    network_structure_measures['density'] = nx.density(network)

    node_centrality = nx.degree_centrality(network)
    network_structure_measures['centrality'] = sum(node_centrality.values()) / len(node_centrality)

    network_structure_measures['average_clustering_coefficient'] = nx.average_clustering(network)

    # Mean/std over the full degree sequence (one entry per node). The
    # original averaged only the *unique* degree values from a Counter,
    # which biases the statistics.
    degree_sequence = [d for _, d in network.degree()]
    network_structure_measures['degree_distribution_mean'] = pd.Series(degree_sequence).mean()
    network_structure_measures['degree_distribution_std'] = pd.Series(degree_sequence).std()

    # Average path length is only defined on a connected graph, so restrict
    # to the giant component.
    giant = max(nx.connected_components(network), key=len)
    network_structure_measures['average_path_length'] = nx.average_shortest_path_length(
        network.subgraph(giant))

    # Algebraic connectivity: second-smallest eigenvalue of the Laplacian.
    # eigvalsh is correct for the symmetric Laplacian and returns real
    # eigenvalues in ascending order (eigvals can return complex values,
    # which the original then failed to sort).
    laplacian = nx.laplacian_matrix(network).toarray()
    eigenvalues = np.linalg.eigvalsh(laplacian)
    network_structure_measures['network_stability'] = (
        eigenvalues[1] if len(eigenvalues) > 1 else np.nan)

    communities = nx.algorithms.community.greedy_modularity_communities(network)
    network_structure_measures['modularity'] = nx.algorithms.community.modularity(
        network, communities)
    return network_structure_measures


# ---------------------------------------------------------------------------
# Script driver: read the data, build one ego network per (year, observed
# firm), compute measures, and write the augmented table back to Excel.
# ---------------------------------------------------------------------------
file_path = "/Users/nicole/Desktop/EXCEL数据/final_data.xlsx"
df = read_and_rename_excel(file_path)

observed_firms_by_year = df.groupby('year')['observed_firm'].unique().to_dict()
networks = {year: [build_network(df, year, observed_firm)
                   for observed_firm in tqdm(observed_firms, desc=f"Processing year {year}")]
            for year, observed_firms in observed_firms_by_year.items()}

for year, networks_in_year in networks.items():
    for network in tqdm(networks_in_year, desc=f"Processing network measures for year {year}"):
        network_structure_measures = calculate_network_structure_measures(network)
        for observed_firm in network.nodes:
            centrality_measures = calculate_centrality_measures(network, observed_firm)
            mask = (df['year'] == year) & (df['observed_firm'] == observed_firm)
            # Index the dicts by the shared column lists so values can never
            # be written against the wrong column.
            df.loc[mask, NETWORK_STRUCTURE_VARS] = [
                network_structure_measures[v] for v in NETWORK_STRUCTURE_VARS]
            df.loc[mask, CENTRALITY_VARS] = [
                centrality_measures[v] for v in CENTRALITY_VARS]

output_file_path = "/Users/nicole/Desktop/EXCEL数据/final_data_networkcaclulation.xlsx"
df.to_excel(output_file_path, index=False)
print(f"Results saved to {output_file_path}")