LigUnity / active_learning_scripts /run_cycle_one_model.py

提交LigUnity初始代码

94391f2 5 months ago

9.23 kB

	import pandas as pd
	import numpy as np
	import subprocess
	import os
	from pathlib import Path
	import random
	import argparse
	import json


	def parse_arguments(argv=None):
	    """Parse the command-line options of the active-learning driver.

	    Args:
	        argv: Optional list of argument strings to parse instead of the
	            process command line. ``None`` (the default) makes argparse
	            read ``sys.argv[1:]``, which is the original behaviour.

	    Returns:
	        argparse.Namespace with the attributes ``input_file``,
	        ``results_dir``, ``al_batch_size``, ``num_repeats``, ``num_cycles``,
	        ``arch``, ``weight_path``, ``lr``, ``master_port``, ``device``,
	        ``begin_greedy`` and ``base_seed``.
	    """
	    parser = argparse.ArgumentParser(description='Active Learning Cycle for Ligand Prediction')

	    # Input/Output arguments
	    parser.add_argument('--input_file', type=str, required=True,
	                        help='Input CSV file containing ligand data (e.g., tyk2_fep.csv)')
	    parser.add_argument('--results_dir', type=str, required=True,
	                        help='Base directory for storing all results')
	    parser.add_argument('--al_batch_size', type=int, required=True,
	                        help='Number of samples for each active learning batch')

	    # Experiment configuration
	    parser.add_argument('--num_repeats', type=int, default=5,
	                        help='Number of repeated experiments (default: 5)')
	    parser.add_argument('--num_cycles', type=int, required=True,
	                        help='Number of active learning cycles')

	    # Model configuration
	    parser.add_argument('--arch', type=str, required=True,
	                        help='Model architecture')
	    parser.add_argument('--weight_path', type=str, required=True,
	                        help='Path to pretrained model weights')
	    parser.add_argument('--lr', type=float, default=0.001,
	                        help='Learning rate (default: 0.001)')
	    parser.add_argument('--master_port', type=int, default=29500,
	                        help='Master port for distributed training (default: 29500)')
	    parser.add_argument('--device', type=int, default=0,
	                        help='Device to run the model on (default: cuda:0)')
	    # Cycles with an index below this value use the half-greedy selection
	    # in update_splits; later cycles are purely greedy.
	    parser.add_argument('--begin_greedy', type=int, default=0,
	                        help='iter of begin to be pure greedy, using half greedy before')

	    # Random seed
	    parser.add_argument('--base_seed', type=int, default=42,
	                        help='Base random seed (default: 42)')

	    return parser.parse_args(argv)


	def run_model(arch, weight_path, results_path, result_file, lr, master_port, train_ligf, test_ligf, device):
	    """Launch ``active_learning_scripts/run_model.sh`` through bash.

	    The script is started with the repository root (the parent of the
	    directory holding this file) as working directory and receives the
	    nine arguments positionally, in the order of this function's
	    signature. ``lr``, ``master_port`` and ``device`` are converted to
	    strings; the remaining values are passed through unchanged.

	    NOTE(review): what the shell script does with these arguments is not
	    visible here -- presumably it fine-tunes on ``train_ligf`` and writes
	    predictions for ``test_ligf`` to ``result_file``; confirm against
	    run_model.sh.

	    Raises:
	        subprocess.CalledProcessError: if the script exits non-zero.
	    """
	    scripts_dir = os.path.dirname(os.path.abspath(__file__))
	    repo_root = os.path.dirname(scripts_dir)

	    launcher = ["bash", "./active_learning_scripts/run_model.sh"]
	    script_args = [
	        arch,
	        weight_path,
	        results_path,
	        result_file,
	        str(lr),
	        str(master_port),
	        train_ligf,
	        test_ligf,
	        str(device),
	    ]
	    # The relative script path above only resolves because cwd is the repo root.
	    subprocess.run(launcher + script_args, check=True, cwd=repo_root)


	def prepare_initial_split(input_file, results_dir, al_batch_size, repeat_idx, cycle_idx, base_seed):
	    """Randomly split the ligand table into an initial train/test pair.

	    Seeds the global ``random`` module with ``base_seed + repeat_idx``,
	    draws ``al_batch_size`` row positions for training and assigns every
	    other row (in original file order) to the test set. Both subsets are
	    written as CSV files named
	    ``repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv`` / ``..._test.csv``
	    inside ``results_dir``.

	    Args:
	        input_file: Path of the CSV file holding all ligands.
	        results_dir: Directory receiving the two split files; created if
	            it does not exist.
	        al_batch_size: Number of rows placed in the initial training set.
	        repeat_idx: Index of the current repeat (also offsets the seed).
	        cycle_idx: Cycle index used in the output file names.
	        base_seed: Base random seed shared by all repeats.

	    Returns:
	        Tuple ``(train_file, test_file)`` with the paths of the written CSVs.

	    Raises:
	        ValueError: if ``al_batch_size`` is negative or larger than the
	            number of rows in ``input_file``.
	    """
	    # Read all ligands
	    df = pd.read_csv(input_file)
	    num_ligands = len(df)

	    if not 0 <= al_batch_size <= num_ligands:
	        raise ValueError(
	            f"al_batch_size ({al_batch_size}) must be between 0 and the number "
	            f"of ligands in {input_file} ({num_ligands})"
	        )

	    # Different seed for each repeat, so repeats differ but stay reproducible.
	    # This seeds the module-level generator, which update_splits also draws from.
	    random.seed(base_seed + repeat_idx)

	    # Randomly select ligands for training; everything else is the test pool.
	    all_indices = list(range(num_ligands))
	    train_indices = random.sample(all_indices, al_batch_size)
	    train_lookup = set(train_indices)  # O(1) membership instead of scanning the list
	    test_indices = [i for i in all_indices if i not in train_lookup]

	    train_df = df.iloc[train_indices]
	    test_df = df.iloc[test_indices]

	    # File names carry repeat and cycle information
	    train_file = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
	    test_file = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")

	    # Create the output directory if needed; an empty dirname means the
	    # current directory, which os.makedirs would reject.
	    out_dir = os.path.dirname(train_file)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)

	    train_df.to_csv(train_file, index=False)
	    test_df.to_csv(test_file, index=False)

	    return train_file, test_file


	def read_jsonl_predictions(results_path, result_file, target="tyk2"):
	    """
	    Read predictions from a jsonl file and average them per compound.

	    The first non-blank line must hold the SMILES list under
	    ``[target]["smiles"]``; every following non-blank line holds one
	    prediction vector under ``[target]["pred"]``. The vectors are averaged
	    element-wise and paired with the SMILES by position.

	    Args:
	        results_path: Directory containing the jsonl file.
	        result_file: File name of the jsonl file inside ``results_path``.
	        target: Top-level key of each JSON record (default ``"tyk2"``,
	            the key the original implementation had hard-coded).

	    Returns:
	        Dictionary mapping each SMILES string to its mean prediction (float).

	    Raises:
	        ValueError: if the file has no records, or a SMILES header but no
	            prediction lines.
	    """
	    jsonl_path = os.path.join(results_path, result_file)
	    with open(jsonl_path, 'r') as f:
	        # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
	        records = [json.loads(line) for line in f if line.strip()]

	    if not records:
	        raise ValueError(f"No records found in {jsonl_path}")

	    # First record carries the SMILES list, the rest carry predictions
	    smiles_list = records[0][target]["smiles"]
	    all_predictions = [record[target]["pred"] for record in records[1:]]
	    if not all_predictions:
	        # Averaging an empty array would yield a NaN scalar and fail later
	        # with an unrelated-looking error, so report the real cause here.
	        raise ValueError(f"No prediction lines found in {jsonl_path}")

	    # Element-wise mean over all prediction lines
	    mean_predictions = np.mean(np.array(all_predictions), axis=0)

	    return {smile: float(pred) for smile, pred in zip(smiles_list, mean_predictions)}


	def update_splits(results_dir, results_path, result_file, prev_train_file, prev_test_file, repeat_idx, cycle_idx,
	                  al_batch_size, begin_greedy):
	    """Move the next batch of compounds from the test pool to the training set.

	    The previous test pool is annotated with the averaged predictions from
	    ``result_file`` (matched on the ``Smiles`` column) and sorted from the
	    highest to the lowest prediction. Then:

	    * if ``cycle_idx >= begin_greedy`` the top ``al_batch_size`` rows are
	      selected (pure greedy);
	    * otherwise the top ``al_batch_size // 2`` rows are selected and the
	      rest of the batch is drawn at random from the remaining pool
	      (half greedy). If the pool is smaller than the random share, the
	      whole pool is taken.

	    The selected rows are appended to the previous training set and both
	    new splits are written to ``results_dir`` as
	    ``repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv`` / ``..._test.csv``.
	    Because selection happens on the annotated frame, the written files
	    carry the ``prediction`` column as well.

	    Args:
	        results_dir: Directory receiving the new split files.
	        results_path: Directory containing ``result_file``.
	        result_file: jsonl file with the model predictions of the last cycle.
	        prev_train_file: CSV with the training set of the last cycle.
	        prev_test_file: CSV with the test pool of the last cycle.
	        repeat_idx: Index of the current repeat (used in file names).
	        cycle_idx: Index of the cycle the new splits are for.
	        al_batch_size: Number of compounds to move into the training set.
	        begin_greedy: First cycle index that uses pure greedy selection.

	    Returns:
	        Tuple ``(new_train_file, new_test_file)``.
	    """
	    # Read predictions from jsonl file
	    predictions = read_jsonl_predictions(results_path, result_file)

	    # Annotate the previous test pool; compounds without a prediction get
	    # NaN, which sort_values places last by default.
	    test_df = pd.read_csv(prev_test_file)
	    test_df['prediction'] = test_df['Smiles'].map(predictions)

	    # Sort by predictions (high to low)
	    test_df_sorted = test_df.sort_values('prediction', ascending=False)

	    # Read previous train file
	    train_df = pd.read_csv(prev_train_file)

	    # Create new file names
	    new_train_file = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
	    new_test_file = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")

	    # Create directory if it doesn't exist (an empty dirname means cwd)
	    out_dir = os.path.dirname(new_train_file)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)

	    if cycle_idx >= begin_greedy:
	        # Pure greedy: take top al_batch_size compounds for training
	        new_train_compounds = test_df_sorted.head(al_batch_size)
	        remaining_test_compounds = test_df_sorted.iloc[al_batch_size:]
	    else:
	        # Half greedy: top half by prediction, other half sampled at random
	        num_greedy = al_batch_size // 2
	        greedy_part = test_df_sorted.head(num_greedy)
	        pool = test_df_sorted.iloc[num_greedy:]

	        # Never request more random picks than the pool holds; random.sample
	        # would raise, while the greedy branch simply takes what is left.
	        num_random = min(al_batch_size - num_greedy, len(pool))
	        pool_positions = list(range(len(pool)))
	        # Uses the module-level generator (seeded in prepare_initial_split).
	        picked_positions = random.sample(pool_positions, num_random)
	        picked_lookup = set(picked_positions)  # O(1) membership tests
	        kept_positions = [i for i in pool_positions if i not in picked_lookup]

	        remaining_test_compounds = pool.iloc[kept_positions]
	        random_part = pool.iloc[picked_positions]
	        new_train_compounds = pd.concat([greedy_part, random_part])

	    # Combine with previous training data
	    combined_train_df = pd.concat([train_df, new_train_compounds])

	    # Banner so the per-cycle summary stands out in the logs
	    for _ in range(3):
	        print("########################################")
	    print("Cycling: ", cycle_idx)
	    # NOTE(review): the top_*p columns are presumably 0/1 flags for the best
	    # 1%/2%/5% of ligands, and the denominators assume a fixed library
	    # size -- confirm against the input CSV. Columns missing from the data
	    # are skipped instead of aborting the run.
	    for column, total in (('top_1p', 100), ('top_2p', 200), ('top_5p', 500)):
	        if column in combined_train_df.columns:
	            print("{}: {}/{}".format(column, combined_train_df[column].sum(), total))

	    # Save files
	    combined_train_df.to_csv(new_train_file, index=False)
	    remaining_test_compounds.to_csv(new_test_file, index=False)

	    return new_train_file, new_test_file


	def run_active_learning(args):
	    """Run the full active-learning experiment described by ``args``.

	    Any existing ``args.results_dir`` is deleted and recreated. For each of
	    ``args.num_repeats`` repeats an initial random split is prepared, then
	    ``args.num_cycles`` cycles are executed. Each cycle runs the model on
	    the current train/test files and, except after the last cycle, calls
	    ``update_splits`` to produce the files for the next cycle.

	    Args:
	        args: Namespace as returned by ``parse_arguments``.

	    Raises:
	        OSError: if the existing results directory cannot be removed.
	        subprocess.CalledProcessError: propagated from ``run_model``.
	    """
	    import shutil

	    # Start from a clean results directory. The filesystem APIs are used
	    # instead of a shell "rm -rf" so that paths with spaces or shell
	    # metacharacters are handled safely and failures are not ignored.
	    if os.path.isdir(args.results_dir) and not os.path.islink(args.results_dir):
	        shutil.rmtree(args.results_dir)
	    elif os.path.lexists(args.results_dir):
	        # Plain file or symlink occupying the path
	        os.remove(args.results_dir)
	    os.makedirs(args.results_dir, exist_ok=True)

	    for repeat_idx in range(args.num_repeats):
	        print(f"Starting repeat {repeat_idx}")

	        # Initial split for this repeat
	        train_file, test_file = prepare_initial_split(
	            args.input_file,
	            args.results_dir,
	            args.al_batch_size,
	            repeat_idx,
	            0,  # First cycle
	            args.base_seed
	        )

	        for cycle_idx in range(args.num_cycles):
	            print(f"Running cycle {cycle_idx} for repeat {repeat_idx}")

	            # All cycles share the base results directory
	            results_path = args.results_dir

	            # Remove any stale result file of the same name before the run
	            result_file = f"repeat_{repeat_idx}_cycle_{cycle_idx}_results.jsonl"
	            result_path = os.path.join(args.results_dir, result_file)
	            if os.path.exists(result_path):
	                os.remove(result_path)

	            # Run the model
	            run_model(
	                arch=args.arch,
	                weight_path=args.weight_path,
	                results_path=results_path,
	                result_file=result_file,
	                lr=args.lr,
	                master_port=args.master_port,
	                train_ligf=train_file,
	                test_ligf=test_file,
	                device=args.device
	            )

	            # Update splits for next cycle
	            if cycle_idx < args.num_cycles - 1:  # Don't update after last cycle
	                train_file, test_file = update_splits(
	                    args.results_dir,
	                    results_path,
	                    result_file,
	                    train_file,
	                    test_file,
	                    repeat_idx,
	                    cycle_idx + 1,
	                    args.al_batch_size,
	                    args.begin_greedy
	                )


	# Script entry point: parse the command line and run the experiment.
	if __name__ == "__main__":
	    run_active_learning(parse_arguments())