-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerateFeeData.py
154 lines (136 loc) · 8.51 KB
/
generateFeeData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Author: James Key
# This script parses all the txs since block startBlockHeight until latestBlockHeight into a csv file, then another
# script will do the actual data processing into another csv file - it would take too much memory to
# hold an array of all the tx fee data. A potential issue with this script is that it doesn't check the integrity of
# past data. This would be an issue if this parser stopped running while having written only part of a block to the
# file - if it does so at block N, the parser won't fill in the missing data for block N, but will instead move on
# to writing block N+1. A potential solution would be to, upon this parser starting and reading the latest block height
# written, check if the number of lines (the number of txs) written for the block height in question is equal to the
# number of txs that it should contain by querying the node RPC. If it's not equal, then copy over all the 'good' block
# data to a new csv file without the 'bad' blocks, and fill in the missing data as you go. However, since the vast
# majority of the time it takes to write a block to CSV (~=30s) is looking up input data from the node RPC, and the
# data writing itself for each block should be far less than 1s, it would be easiest to just check the integrity of the
# dataset once parsing has finished so this only ever needs to be done once. Also, if it seems like a block is missing,
# first check if it's an empty block - if it is, nothing will be written since coinbase txs are ignored since they have
# no fees themselves. It should also be noted that matching btc price data to a block depends on the miner-defined
# time at which that block was mined, which is unlikely to be inaccurate, but could be, either due to an incorrect
# setting or due to the miner performing some kind of attack on the network. However, the time is guaranteed to be
# within some range of the median time I think, within which the btc price won't have fluctuated much, so in practice
# it's not an issue
import bisect
import configparser
import csv
import os
import time

from bitcoinrpc.authproxy import AuthServiceProxy, JSONRPCException
# Wall-clock start of the whole run; the per-block report measures total elapsed time against it
t0 = time.time()
# Read from the parser's conf file and connect to the node's RPC
conf = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files it did parse, so an empty result
# means the conf file was not found or not readable. Fail here with a clear message rather than with a
# KeyError on the "config" section below
if not conf.read("btcparser.conf"):
    raise FileNotFoundError("Could not read btcparser.conf (looked in the current working directory)")
nodeCredentials = conf["config"]["nodeCredentials"]
node = AuthServiceProxy(nodeCredentials)
# Block from the start of 2017 is 446000
# Block height to start parsing from. If a fee data file for this range already exists, the script moves this
# forward to just past the highest block height already written
startBlockHeight = 525000
# Exclusive upper bound of the range to parse: blocks startBlockHeight..latestBlockHeight-1 are parsed and
# latestBlockHeight itself WON'T be included. Chosen to be before the end of our price data time-wise
latestBlockHeight = 549500
# Number of blocks visited so far in this run (empty blocks included); used for the average time per block
blocksParsed = 0
priceDataFileName = "bitcoin-historical-data/coinbaseUSD_10-min_data_2016-12-31_to_2018-11-11.csv"
feeDataFileName = "bitcoin-historical-data/txFeeDataFromBlock{}To{}.csv".format(startBlockHeight, latestBlockHeight)
def _nextUnwrittenHeight(fileName, firstHeight):
    """Return the block height at which parsing should resume.

    Scans the existing fee CSV ``fileName`` and returns one more than the largest block height found in its
    first column, or ``firstHeight`` when no row reaches that height.
    """
    resumeHeight = firstHeight
    with open(fileName, newline='') as feeData:
        feeDataReader = csv.reader(feeData, delimiter=',')
        # The fee file has a single header row, so skip exactly one. The default keeps a completely empty file
        # from raising StopIteration
        next(feeDataReader, None)
        for row in feeDataReader:
            # csv.reader yields [] for blank lines; there is no height to read from those
            if row and int(row[0]) >= resumeHeight:
                resumeHeight = int(row[0]) + 1
    return resumeHeight


def _loadPriceTable(fileName):
    """Read the price CSV once and return ``(timestamps, rows)`` ordered by timestamp.

    ``timestamps[i]`` is column 0 of ``rows[i]`` converted to int. The rows are kept whole so the price column is
    only converted for the rows that are actually looked up.
    """
    with open(fileName, newline='') as priceData:
        priceDataReader = csv.reader(priceData, delimiter=',')
        # The first row is the header; the price data actually starts on the 2nd row
        next(priceDataReader, None)
        table = [(int(row[0]), row) for row in priceDataReader if row]
    # Stable sort on the timestamp only, so rows sharing a timestamp keep their file order
    table.sort(key=lambda entry: entry[0])
    return [entry[0] for entry in table], [entry[1] for entry in table]


def _btcPriceAt(timestamps, rows, blockTime):
    """Return the price (column 7) of the first row whose 600 s window contains ``blockTime``.

    A row with timestamp ``t`` matches when ``t <= blockTime <= t + 600``. Every lookup searches the whole
    table, so a block whose reported time is earlier than the previous block's is still matched correctly.

    Raises:
        LookupError: if no row's window contains ``blockTime``.
    """
    # First row that has not already ended before the block, i.e. the first one with t + 600 >= blockTime
    i = bisect.bisect_left(timestamps, blockTime - 600)
    if i == len(timestamps) or timestamps[i] > blockTime:
        raise LookupError("No price data covers block time {}".format(blockTime))
    return float(rows[i][7])


# Need to check if the feeData file exists so the header can be written if not
if not os.path.isfile(feeDataFileName):
    # newline='' is what the csv module asks for so that it controls the line endings itself
    with open(feeDataFileName, 'w', newline='') as feeData:
        feeDataWriter = csv.writer(feeData)
        feeDataWriter.writerow(["height", "time", "btcPrice", "txid", "txVSize", "btcSpent", "usdSpent", "feeInBtc",
                                "feeInUSD", "btcPerVByte", "USDPerVByte"])
# If it does exist, carry on from the block after the last block height that was written to it
else:
    startBlockHeight = _nextUnwrittenHeight(feeDataFileName, startBlockHeight)

# The price data is loaded once and searched per block
priceTimes, priceRows = _loadPriceTable(priceDataFileName)

# Start editing the data
with open(feeDataFileName, 'a', newline='') as feeData:
    feeDataWriter = csv.writer(feeData)
    # Iterate through blocks
    for height in range(startBlockHeight, latestBlockHeight):
        blocksParsed += 1
        newBlock = time.time()
        inputCount = 0
        rpcLookupCount = 0
        t2 = time.time()
        print()
        print('Height: ', height)
        # Get block data from the node
        block = node.getblock(node.getblockhash(height))
        # No point doing anything if it's an empty block (only the coinbase tx)
        if len(block['tx']) <= 1:
            continue
        # Find the price data that was the latest before the block was mined
        btcPrice = _btcPriceAt(priceTimes, priceRows, block['time'])
        t3 = time.time()
        # This will be a 2d list - a list where each element is a list of data for an individual tx
        txsData = []
        # Iterate through every tx in a block. Don't want to include the coinbase tx, which is first
        for txid in block['tx'][1:]:
            totalIn = 0
            totalOut = 0
            rpcLookupCount += 1
            # Reset per tx so the error report below never shows an input belonging to a previous tx
            txInput = None
            # Get data of a tx
            tx = node.decoderawtransaction(node.getrawtransaction(txid), True)
            try:
                # Need the total Bitcoins being used as inputs
                for txInput in tx['vin']:
                    inputCount += 1
                    rpcLookupCount += 1
                    # Need to know which index of an output of the previous tx is being spent
                    index = txInput['vout']
                    # The node only refers to the txid of the output that's being spent in this input, so need
                    # to fetch that tx first
                    txo = node.decoderawtransaction(node.getrawtransaction(txInput['txid']), True)
                    # Need to get the actual output of that tx being spent to get the amount
                    totalIn += txo['vout'][index]['value']
                # Need the total Bitcoins being spent as outputs
                for output in tx['vout']:
                    totalOut += output['value']
                # Subtract before converting so the fee is rounded to float only once. NOTE(review): this is exact
                # only if the RPC client returns amounts as Decimal - confirm for the installed bitcoinrpc version
                feeBtc = float(totalIn - totalOut)
                totalOut = float(totalOut)
                # Collect the tx data for this block so it can be sorted by the fee/vByte
                txsData.append([height, block['time'], btcPrice, txid, tx['vsize'], totalOut, totalOut*btcPrice,
                                feeBtc, feeBtc*btcPrice, feeBtc/tx['vsize'], feeBtc*btcPrice/(tx['vsize'])])
            except Exception as e:
                # Best effort: report the tx that could not be processed and carry on with the rest of the block
                print('Error: ', e)
                print(txid)
                print(txInput)
        t4 = time.time()
        # Sort every tx in a block by the fee/vByte (btcPerVByte, column 9) from smallest to largest
        sortedTxsData = sorted(txsData, key=lambda x: x[9])
        t5 = time.time()
        # Write the sorted data into the csv file and push it to the OS straight away, so a finished block does
        # not sit in the write buffer while the next block is being looked up
        feeDataWriter.writerows(sortedTxsData)
        feeData.flush()
        now = time.time()
        t6 = now - t0
        blockSeconds = now - newBlock
        print("Time elapsed: {} s {} mins {} hours {} days".format(
            t6, t6/60, t6/3600, t6/86400))
        # All twelve values go inside the one format() call
        print(("Time to: open priceData = {}, find correct price = {}, look up fee data = {},\n"
               "sort fee data = {} write fee data = {}, total for block = {}, number of inputs = {},\n"
               "time per input = {}, rpcLookupCount = {}, time per rpc lookup = {},\n"
               "blocksParsed = {}, average parse time per block = {}s").format(
            t2-newBlock, t3-t2, t4-t3, t5-t4, now-t5, blockSeconds, inputCount,
            # inputCount stays 0 if no tx in the block yielded an input
            blockSeconds/inputCount if inputCount else 0.0, rpcLookupCount, blockSeconds/rpcLookupCount,
            blocksParsed, t6/blocksParsed))